[PATCH] Microstate accounting for 2.6.0-test1

From: Peter Chubb (peterc_at_gelato.unsw.edu.au)
Date: 07/22/03

  • Next message: Herbert Pötzl: "Re: where can i find the src of No.0 process?"
    To: linux-kernel@vger.kernel.org
    Date:	Tue, 22 Jul 2003 13:27:50 +1000
    
    

    Hi Folks,
       Here's the latest and greatest Microstate accounting patch, against
    the 2.6.0-test1 kernel as of this morning my time. Tested on IA64 and
    I386.

       When applied, and microstate accounting is enabled, the patch adds
    a new file /proc/pid/msa that contains the time in cycles for each
    state. As a bonus, you get the time spent in each interrupt handler.

    Sample output on a 2.5G P4:

    ; cat /proc/$$/msa
    State: Interruptible
    ONCPU_USER 65290612
    ONCPU_SYS 32495176
    INTERRUPTIBLE 41788648905302
    UNINTERRUPTIBLE 885657552
    INTERRUPTED 2225408
    ACTIVEQUEUE 1539599360
    EXPIREDQUEUE 0
    STOPPED 0
    ZOMBIE 0
    SLP_POLL 0
    SLP_PAGING 0
    SLP_FUTEX 0

    $ cat /proc/interrupts
               CPU0
      0: 77361890 648007534 IO-APIC-edge timer
      2: 0 0 XT-PIC cascade
      3: 0 0 IO-APIC-edge serial
      4: 2301 60917 IO-APIC-edge serial
      8: 4 25 IO-APIC-edge rtc
      9: 14 188 IO-APIC-level acpi
     14: 291288 4067531 IO-APIC-edge ide0
     15: 18 158 IO-APIC-edge ide1
     16: 178246 1652739 IO-APIC-level uhci-hcd
     18: 73399 579808 IO-APIC-level uhci-hcd, eth0
     19: 31 139 IO-APIC-level uhci-hcd
     23: 0 0 IO-APIC-level ehci_hcd
    NMI: 0
    LOC: 77367218
    ERR: 0
    MIS: 0

    (the third column is time in nanoseconds)

    The patch also adds a new system call, which gives scaled (nanosecond)
    numbers.
            msa(n, select, timers)
            int n;
            enum { MSA_SELF, MSA_CHILDREN } select,
            clk_t timers[n]

            fills the n timers with the number of nanoseconds in that
            state, for the current process, or for waited-for children.

    GOTCHAS:
            -- I've measured the overhead as around 5% per context switch,
            but negligeable for real workloads.

            -- Time spent in the kernel as a result of a trap (e.g., when
            paging) is accounted for as user time (which means that time
            reported as ONCPU_SYS is *only* system call time)

            -- If you're on a machine where the CPU frequecny cvhanges,
            the results are useless. You really need Dominik Brodowski's
            patch from
            http://marc.theaimsgroup.com/?l=linux-kernel&m=105860269801212&q=raw
            with corresponding changes to asm-i386/msa.h

    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/i386/Kconfig linux-2.5-ustate/arch/i386/Kconfig
    --- linux-2.5-import/arch/i386/Kconfig Mon Jul 14 11:12:14 2003
    +++ linux-2.5-ustate/arch/i386/Kconfig Mon Jul 14 15:15:29 2003
    @@ -1383,6 +1383,15 @@
             depends on X86_LOCAL_APIC && !X86_VISWS
             default y
     
    +config MICROSTATE
    + bool "Microstate accounting"
    + help
    + This option causes the kernel to keep very accurate track of
    + how long your threads spend on the runqueues, running, or asleep or
    + stopped. It will slow down your kernel.
    + Times are reported in /proc/pid/msa and through a new msa()
    + system call.
    +
     endmenu
     
     source "security/Kconfig"
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/i386/kernel/entry.S linux-2.5-ustate/arch/i386/kernel/entry.S
    --- linux-2.5-import/arch/i386/kernel/entry.S Mon Jul 14 11:12:14 2003
    +++ linux-2.5-ustate/arch/i386/kernel/entry.S Mon Jul 14 15:15:29 2003
    @@ -264,9 +264,17 @@
     
             testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
             jnz syscall_trace_entry
    +#ifdef CONFIG_MICROSTATE
    + pushl %eax
    + call msa_start_syscall
    + popl %eax
    +#endif
             call *sys_call_table(,%eax,4)
             movl %eax,EAX(%esp)
             cli
    +#ifdef CONFIG_MICROSTATE
    + call msa_end_syscall
    +#endif
             movl TI_FLAGS(%ebp), %ecx
             testw $_TIF_ALLWORK_MASK, %cx
             jne syscall_exit_work
    @@ -288,9 +296,17 @@
             testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
             jnz syscall_trace_entry
     syscall_call:
    +#ifdef CONFIG_MICROSTATE
    + pushl %eax
    + call msa_start_syscall
    + popl %eax
    +#endif
             call *sys_call_table(,%eax,4)
             movl %eax,EAX(%esp) # store the return value
     syscall_exit:
    +#ifdef CONFIG_MICROSTATE
    + call msa_end_syscall
    +#endif
             cli # make sure we don't miss an interrupt
                                             # setting need_resched or sigpending
                                             # between sampling and the iret
    @@ -878,5 +894,6 @@
             .long sys_fstatfs64
             .long sys_tgkill /* 270 */
             .long sys_utimes
    + .long sys_msa /* 272 */
      
     nr_syscalls=(.-sys_call_table)/4
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/i386/kernel/irq.c linux-2.5-ustate/arch/i386/kernel/irq.c
    --- linux-2.5-import/arch/i386/kernel/irq.c Tue Jul 8 08:32:12 2003
    +++ linux-2.5-ustate/arch/i386/kernel/irq.c Tue Jul 8 09:19:05 2003
    @@ -157,10 +157,18 @@
                     seq_printf(p, "%3d: ",i);
     #ifndef CONFIG_SMP
                     seq_printf(p, "%10u ", kstat_irqs(i));
    +#ifdef CONFIG_MICROSTATE
    + seq_printf(p, "%10llu", msa_irq_time(0, i));
    +#endif
     #else
                     for (j = 0; j < NR_CPUS; j++)
    - if (cpu_online(j))
    + if (cpu_online(j)) {
                                     seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
    +#ifdef CONFIG_MICROSTATE
    + seq_printf(p, "%10llu", msa_irq_time(j, i));
    +#endif
    + }
    +
     #endif
                     seq_printf(p, " %14s", irq_desc[i].handler->typename);
                     seq_printf(p, " %s", action->name);
    @@ -421,6 +429,7 @@
             unsigned int status;
     
             irq_enter();
    + msa_start_irq(irq);
     
     #ifdef CONFIG_DEBUG_STACKOVERFLOW
             /* Debugging check for stack overflow: is there less than 1KB free? */
    @@ -500,6 +509,7 @@
             spin_unlock(&desc->lock);
     
             irq_exit();
    + msa_finish_irq(irq);
     
             return 1;
     }
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/ia64/Kconfig linux-2.5-ustate/arch/ia64/Kconfig
    --- linux-2.5-import/arch/ia64/Kconfig Mon Jul 14 11:12:15 2003
    +++ linux-2.5-ustate/arch/ia64/Kconfig Mon Jul 14 15:15:29 2003
    @@ -740,6 +740,15 @@
               and restore instructions. It's useful for tracking down spinlock
               problems, but slow! If you're unsure, select N.
     
    +config MICROSTATE
    + bool "Microstate accounting"
    + help
    + This option causes the kernel to keep very accurate track of
    + how long your threads spend on the runqueues, running, or asleep or
    + stopped. It will slow down your kernel.
    + Times are reported in /proc/pid/msa and through a new msa()
    + system call.
    +
     endmenu
     
     source "security/Kconfig"
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/ia64/kernel/entry.S linux-2.5-ustate/arch/ia64/kernel/entry.S
    --- linux-2.5-import/arch/ia64/kernel/entry.S Fri Jul 18 08:27:52 2003
    +++ linux-2.5-ustate/arch/ia64/kernel/entry.S Tue Jul 22 09:28:54 2003
    @@ -539,6 +539,36 @@
             br.cond.sptk strace_save_retval
     END(ia64_trace_syscall)
     
    +#ifdef CONFIG_MICROSTATE
    +GLOBAL_ENTRY(invoke_msa_end_syscall)
    + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
    + alloc loc1=ar.pfs,8,4,0,0
    + mov loc0=rp
    + .body
    + ;;
    + br.call.sptk.many rp=msa_end_syscall
    +1: mov rp=loc0
    + mov ar.pfs=loc1
    + br.ret.sptk.many rp
    +END(invoke_msa_end_syscall)
    +
    +GLOBAL_ENTRY(invoke_msa_start_syscall)
    + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
    + alloc loc1=ar.pfs,8,4,0,0
    + mov loc0=rp
    + .body
    + mov loc2=b6
    + mov loc3=r15
    + ;;
    + br.call.sptk.many rp=msa_start_syscall
    +1: mov rp=loc0
    + mov r15=loc3
    + mov ar.pfs=loc1
    + mov b6=loc2
    + br.ret.sptk.many rp
    +END(invoke_msa_start_syscall)
    +#endif /* CONFIG_MICROSTATE */
    +
     GLOBAL_ENTRY(ia64_ret_from_clone)
             PT_REGS_UNWIND_INFO(0)
     { /*
    @@ -620,6 +650,10 @@
      */
     GLOBAL_ENTRY(ia64_leave_syscall)
             PT_REGS_UNWIND_INFO(0)
    +#ifdef CONFIG_MICROSTATE
    + br.call.sptk.many rp=invoke_msa_end_syscall
    +1:
    +#endif
             /*
              * work.need_resched etc. mustn't get changed by this CPU before it returns to
              * user- or fsys-mode, hence we disable interrupts early on:
    @@ -961,7 +995,7 @@
             mov loc7=0
     (pRecurse) br.call.sptk.few b0=rse_clear_invalid
             ;;
    - mov loc8=0
    +1: mov loc8=0
             mov loc9=0
             cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret
             mov loc10=0
    @@ -1461,7 +1495,7 @@
             data8 sys_clock_nanosleep
             data8 sys_fstatfs64
             data8 sys_statfs64
    - data8 ia64_ni_syscall
    + data8 sys_msa
             data8 ia64_ni_syscall // 1260
             data8 ia64_ni_syscall
             data8 ia64_ni_syscall
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/ia64/kernel/irq_ia64.c linux-2.5-ustate/arch/ia64/kernel/irq_ia64.c
    --- linux-2.5-import/arch/ia64/kernel/irq_ia64.c Fri Jun 27 09:17:14 2003
    +++ linux-2.5-ustate/arch/ia64/kernel/irq_ia64.c Tue Jul 22 09:28:56 2003
    @@ -76,6 +76,7 @@
     ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
     {
             unsigned long saved_tpr;
    + ia64_vector oldvector;
     #ifdef CONFIG_SMP
     # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE)
     #else
    @@ -119,6 +120,8 @@
              */
             saved_tpr = ia64_get_tpr();
             ia64_srlz_d();
    + oldvector = vector;
    + msa_start_irq(local_vector_to_irq(vector));
             while (vector != IA64_SPURIOUS_INT_VECTOR) {
                     if (!IS_RESCHEDULE(vector)) {
                             ia64_set_tpr(vector);
    @@ -133,7 +136,10 @@
                             ia64_set_tpr(saved_tpr);
                     }
                     ia64_eoi();
    - vector = ia64_get_ivr();
    + oldvector = vector;
    + vector = ia64_get_ivr();
    + msa_continue_irq(local_vector_to_irq(oldvector),
    + local_vector_to_irq(vector));
             }
             /*
              * This must be done *after* the ia64_eoi(). For example, the keyboard softirq
    @@ -142,6 +148,8 @@
              */
             if (local_softirq_pending())
                     do_softirq();
    +
    + msa_finish_irq(local_vector_to_irq(vector));
     }
     
     #ifdef CONFIG_SMP
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/arch/ia64/kernel/ivt.S linux-2.5-ustate/arch/ia64/kernel/ivt.S
    --- linux-2.5-import/arch/ia64/kernel/ivt.S Tue Jun 24 15:12:05 2003
    +++ linux-2.5-ustate/arch/ia64/kernel/ivt.S Tue Jul 22 09:28:56 2003
    @@ -697,6 +697,10 @@
             srlz.i // guarantee that interruption collection is on
             ;;
     (p15) ssm psr.i // restore psr.i
    +#ifdef CONFIG_MICROSTATE
    + br.call.sptk.many rp=invoke_msa_start_syscall
    +1:
    +#endif /* CONFIG_MICROSTATE */
             ;;
             mov r3=NR_syscalls - 1
             movl r16=sys_call_table
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/fs/proc/base.c linux-2.5-ustate/fs/proc/base.c
    --- linux-2.5-import/fs/proc/base.c Mon Jul 14 11:12:20 2003
    +++ linux-2.5-ustate/fs/proc/base.c Tue Jul 22 09:28:57 2003
    @@ -65,6 +65,9 @@
             PROC_PID_ATTR_EXEC,
             PROC_PID_ATTR_FSCREATE,
     #endif
    +#ifdef CONFIG_MICROSTATE
    + PROC_PID_MSA,
    +#endif
             PROC_PID_FD_DIR = 0x8000, /* 0x8000-0xffff */
     };
     
    @@ -95,6 +98,9 @@
     #ifdef CONFIG_KALLSYMS
       E(PROC_PID_WCHAN, "wchan", S_IFREG|S_IRUGO),
     #endif
    +#ifdef CONFIG_MICROSTATE
    + E(PROC_PID_MSA, "msa", S_IFREG|S_IRUGO),
    +#endif
       {0,0,NULL,0}
     };
     #ifdef CONFIG_SECURITY
    @@ -288,6 +294,60 @@
     }
     #endif /* CONFIG_KALLSYMS */
     
    +#ifdef CONFIG_MICROSTATE
    +/*
    + * provides microstate accounting information
    + *
    + */
    +static int proc_pid_msa(struct task_struct *task, char *buffer)
    +{
    + struct microstates *msp = &task->microstates;
    + static char *statenames[] = {
    + "User",
    + "System",
    + "Interruptible",
    + "Uninterruptible",
    + "OnActiveQueue",
    + "OnExpiredQueue",
    + "Zombie",
    + "Stopped",
    + "Paging",
    + "Futex",
    + "Poll",
    + "Interrupted",
    + };
    +
    + return sprintf(buffer,
    + "State: %s\n" \
    + "ONCPU_USER %15llu\n" \
    + "ONCPU_SYS %15llu\n" \
    + "INTERRUPTIBLE %15llu\n" \
    + "UNINTERRUPTIBLE%15llu\n" \
    + "INTERRUPTED %15llu\n" \
    + "ACTIVEQUEUE %15llu\n" \
    + "EXPIREDQUEUE %15llu\n" \
    + "STOPPED %15llu\n" \
    + "ZOMBIE %15llu\n" \
    + "SLP_POLL %15llu\n" \
    + "SLP_PAGING %15llu\n" \
    + "SLP_FUTEX %15llu\n", \
    + msp->cur_state >= 0 && msp->cur_state < NR_MICRO_STATES ?
    + statenames[msp->cur_state] : "Impossible",
    + (unsigned long long)msp->timers[ONCPU_USER],
    + (unsigned long long)msp->timers[ONCPU_SYS],
    + (unsigned long long)msp->timers[INTERRUPTIBLE_SLEEP],
    + (unsigned long long)msp->timers[UNINTERRUPTIBLE_SLEEP],
    + (unsigned long long)msp->timers[INTERRUPTED],
    + (unsigned long long)msp->timers[ONACTIVEQUEUE],
    + (unsigned long long)msp->timers[ONEXPIREDQUEUE],
    + (unsigned long long)msp->timers[STOPPED],
    + (unsigned long long)msp->timers[ZOMBIE],
    + (unsigned long long)msp->timers[POLL_SLEEP],
    + (unsigned long long)msp->timers[PAGING_SLEEP],
    + (unsigned long long)msp->timers[FUTEX_SLEEP]);
    +}
    +#endif /* CONFIG_MICROSTATE */
    +
     /************************************************************************/
     /* Here the fs part begins */
     /************************************************************************/
    @@ -1205,6 +1265,12 @@
                     case PROC_PID_WCHAN:
                             inode->i_fop = &proc_info_file_operations;
                             ei->op.proc_read = proc_pid_wchan;
    + break;
    +#endif
    +#ifdef CONFIG_MICROSTATE
    + case PROC_PID_MSA:
    + inode->i_fop = &proc_info_file_operations;
    + ei->op.proc_read = proc_pid_msa;
                             break;
     #endif
                     default:
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/fs/select.c linux-2.5-ustate/fs/select.c
    --- linux-2.5-import/fs/select.c Fri Jun 13 09:35:59 2003
    +++ linux-2.5-ustate/fs/select.c Tue Jul 8 10:48:29 2003
    @@ -253,6 +253,7 @@
                             retval = table.error;
                             break;
                     }
    + msa_next_state(current, POLL_SLEEP);
                     __timeout = schedule_timeout(__timeout);
             }
             __set_current_state(TASK_RUNNING);
    @@ -443,6 +444,7 @@
                     count = wait->error;
                     if (count)
                             break;
    + msa_next_state(current, POLL_SLEEP);
                     timeout = schedule_timeout(timeout);
             }
             __set_current_state(TASK_RUNNING);
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/include/asm-i386/msa.h linux-2.5-ustate/include/asm-i386/msa.h
    --- linux-2.5-import/include/asm-i386/msa.h Thu Jan 1 10:00:00 1970
    +++ linux-2.5-ustate/include/asm-i386/msa.h Tue Jul 22 12:58:57 2003
    @@ -0,0 +1,29 @@
    +/************************************************************************
    + * asm-i386/msa.h
    + *
    + * Provide an architecture-specific clock.
    + */
    +
    +#ifndef _ASM_I386_MSA_H
    +# define _ASM_I386_MSA_H
    +
    +# ifdef __KERNEL__
    +# include <linux/config.h>
    +
    +# if defined(CONFIG_X86_TSC)
    +# include <asm/msr.h>
    +# include <asm/div64.h>
    +# define MSA_NOW(now) rdtscll(now)
    +
    +extern unsigned long cpu_khz;
    +# define MSA_TO_NSEC(clk) ({ clk_t _x = ((clk) * 1000000ULL); do_div(_x, cpu_khz); _x; })
    +
    +# else
    +unsigned long long monotonic_clock(void);
    +# define MSA_NOW(now) do { now = monotonic_clock(); } while (0)
    +# define MSA_TO_NSEC(clk) (clk)
    +# endif
    +
    +# endif /* _KERNEL */
    +
    +#endif /* _ASM_I386_MSA_H */
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/include/asm-i386/unistd.h linux-2.5-ustate/include/asm-i386/unistd.h
    --- linux-2.5-import/include/asm-i386/unistd.h Mon Jul 14 11:12:20 2003
    +++ linux-2.5-ustate/include/asm-i386/unistd.h Mon Jul 14 15:15:41 2003
    @@ -277,6 +277,7 @@
     #define __NR_fstatfs64 269
     #define __NR_tgkill 270
     #define __NR_utimes 271
    +#define __NR_msa 272
     
     #define NR_syscalls 272
     
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/include/asm-ia64/msa.h linux-2.5-ustate/include/asm-ia64/msa.h
    --- linux-2.5-import/include/asm-ia64/msa.h Thu Jan 1 10:00:00 1970
    +++ linux-2.5-ustate/include/asm-ia64/msa.h Wed Jul 2 09:49:23 2003
    @@ -0,0 +1,21 @@
    +/************************************************************************
    + * asm-ia64/msa.h
    + *
    + * Provide an architecture-specific clock.
    + */
    +
    +#ifndef _ASM_IA64_MSA_H
    +#define _ASM_IA64_MSA_H
    +
    +#ifdef __KERNEL__
    +#include <asm/processor.h>
    +#include <asm/timex.h>
    +#include <asm/smp.h>
    +
    +#define MSA_NOW(now) do { now = (clk_t)get_cycles(); } while (0)
    +
    +#define MSA_TO_NSEC(clk) ((1000000000*clk) / cpu_data(smp_processor_id())->itc_freq)
    +
    +#endif /* _KERNEL */
    +
    +#endif /* _ASM_IA64_MSA_H */
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/include/asm-ia64/unistd.h linux-2.5-ustate/include/asm-ia64/unistd.h
    --- linux-2.5-import/include/asm-ia64/unistd.h Fri Jul 11 10:10:32 2003
    +++ linux-2.5-ustate/include/asm-ia64/unistd.h Mon Jul 14 15:15:41 2003
    @@ -248,10 +248,11 @@
     #define __NR_sys_clock_nanosleep 1256
     #define __NR_sys_fstatfs64 1257
     #define __NR_sys_statfs64 1258
    +#define __NR_sys_msa 1259
     
     #ifdef __KERNEL__
     
    -#define NR_syscalls 256 /* length of syscall table */
    +#define NR_syscalls 271 /* length of syscall table */
     
     #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
     
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/include/linux/msa.h linux-2.5-ustate/include/linux/msa.h
    --- linux-2.5-import/include/linux/msa.h Thu Jan 1 10:00:00 1970
    +++ linux-2.5-ustate/include/linux/msa.h Tue Jul 22 09:28:58 2003
    @@ -0,0 +1,139 @@
    +/*
    + * msa.h
    + * microstate accouting
    + */
    +
    +#ifndef _LINUX_MSA_H
    +#define _LINUX_MSA_H
    +#include <config/microstate.h>
    +
    +#include <asm/msa.h>
    +
    +typedef __u64 clk_t;
    +
    +extern clk_t msa_last_flip[];
    +
    +/*
    + * Tracked states
    + */
    +
    +enum thread_state {
    + UNKNOWN = -1,
    + ONCPU_USER,
    + ONCPU_SYS,
    + INTERRUPTIBLE_SLEEP,
    + UNINTERRUPTIBLE_SLEEP,
    + ONACTIVEQUEUE,
    + ONEXPIREDQUEUE,
    + ZOMBIE,
    + STOPPED,
    + INTERRUPTED,
    + PAGING_SLEEP,
    + FUTEX_SLEEP,
    + POLL_SLEEP,
    +
    + NR_MICRO_STATES /* Must be last */
    +};
    +
    +#define ONCPU ONCPU_USER /* for now... */
    +
    +/*
    + * Times are tracked for the current task in timers[],
    + * and for the current task's children in child_timers[] (accumulated at wait() time)
    + */
    +struct microstates {
    + enum thread_state cur_state;
    + enum thread_state next_state;
    + int lastqueued;
    + unsigned flags;
    + clk_t last_change;
    + clk_t timers[NR_MICRO_STATES];
    + clk_t child_timers[NR_MICRO_STATES];
    +};
    +
    +/*
    + * Values for microstates.flags
    + */
    +#define QUEUE_FLIPPED (1<<0) /* Active and Expired queues were swapped */
    +#define MSA_SYS (1<<1) /* this task executing in system call */
    +
    +/*
    + * A system call for getting the timers.
    + * The number of timers wanted is passed as argument, in case not all
    + * are needed (and to guard against when we add more timers!)
    + */
    +
    +#define MSA_SELF 0
    +#define MSA_CHILDREN 1
    +
    +
    +#if defined __KERNEL__
    +extern long sys_msa(int ntimers, int which, clk_t *timers);
    +#if defined(CONFIG_MICROSTATE)
    +#include <asm/current.h>
    +#include <asm/irq.h>
    +
    +
    +#define MSA_SOFTIRQ NR_IRQS
    +
    +void msa_init_timer(struct task_struct *task);
    +void msa_switch(struct task_struct *prev, struct task_struct *next);
    +void msa_update_parent(struct task_struct *parent, struct task_struct *this);
    +void msa_init(struct task_struct *p);
    +void msa_set_timer(struct task_struct *p, int state);
    +void msa_start_irq(int irq);
    +void msa_continue_irq(int oldirq, int newirq);
    +void msa_finish_irq(int irq);
    +void msa_start_syscall(void);
    +void msa_end_syscall(void);
    +
    +clk_t msa_irq_time(int cpu, int irq);
    +
    +#ifdef TASK_STRUCT_DEFINED
    +static inline void msa_next_state(struct task_struct *p, enum thread_state next_state)
    +{
    + p->microstates.next_state = next_state;
    +}
    +static inline void msa_flip_expired(struct task_struct *prev) {
    + prev->microstates.flags |= QUEUE_FLIPPED;
    +}
    +
    +static inline void msa_syscall(void) {
    + if (current->microstates.cur_state == ONCPU_USER)
    + msa_start_syscall();
    + else
    + msa_end_syscall();
    +}
    +
    +#else
    +#define msa_next_state(p, s) ((p)->microstates.next_state = (s))
    +#define msa_flip_expired(p) ((p)->microstates.flags |= QUEUE_FLIPPED)
    +#define msa_syscall() do { \
    + if (current->microstates.cur_state == ONCPU_USER) \
    + msa_start_syscall(); \
    + else \
    + msa_end_syscall(); \
    +} while (0)
    +
    +#endif
    +#else /* CONFIG_MICROSTATE */
    +
    +
    +static inline void msa_switch(struct task_struct *prev, struct task_struct *next) { }
    +static inline void msa_update_parent(struct task_struct *parent, struct task_struct *this) { }
    +
    +static inline void msa_init(struct task_struct *p) { }
    +static inline void msa_set_timer(struct task_struct *p, int state) { }
    +static inline void msa_start_irq(int irq) { }
    +static inline void msa_continue_irq(int oldirq, int newirq) { }
    +static inline void msa_finish_irq(int irq) { };
    +
    +static inline clk_t msa_irq_time(int cpu, int irq) { return 0; }
    +static inline void msa_next_state(struct task_struct *p, int s) { }
    +static inline void msa_flip_expired(struct task_struct *p) { }
    +static inline void msa_start_syscall(void) { }
    +static inline void msa_end_syscall(void) { }
    +
    +#endif /* CONFIG_MICROSTATE */
    +#endif /* __KERNEL__ */
    +#endif /* _LINUX_MSA_H */
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/include/linux/sched.h linux-2.5-ustate/include/linux/sched.h
    --- linux-2.5-import/include/linux/sched.h Mon Jul 21 09:10:45 2003
    +++ linux-2.5-ustate/include/linux/sched.h Mon Jul 21 15:06:20 2003
    @@ -28,6 +28,7 @@
     #include <linux/completion.h>
     #include <linux/pid.h>
     #include <linux/percpu.h>
    +#include <linux/msa.h>
     
     struct exec_domain;
     
    @@ -391,6 +392,9 @@
             struct list_head posix_timers; /* POSIX.1b Interval Timers */
             unsigned long utime, stime, cutime, cstime;
             u64 start_time;
    +#ifdef CONFIG_MICROSTATE
    + struct microstates microstates;
    +#endif
     /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
             unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
     /* process credentials */
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/kernel/Makefile linux-2.5-ustate/kernel/Makefile
    --- linux-2.5-import/kernel/Makefile Fri Jun 13 09:37:26 2003
    +++ linux-2.5-ustate/kernel/Makefile Wed Jul 2 10:17:54 2003
    @@ -6,7 +6,8 @@
                 exit.o itimer.o time.o softirq.o resource.o \
                 sysctl.o capability.o ptrace.o timer.o user.o \
                 signal.o sys.o kmod.o workqueue.o pid.o \
    - rcupdate.o intermodule.o extable.o params.o posix-timers.o
    + rcupdate.o intermodule.o extable.o params.o posix-timers.o \
    + msa.o
     
     obj-$(CONFIG_FUTEX) += futex.o
     obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/kernel/exit.c linux-2.5-ustate/kernel/exit.c
    --- linux-2.5-import/kernel/exit.c Tue Jul 8 08:32:12 2003
    +++ linux-2.5-ustate/kernel/exit.c Tue Jul 8 08:58:19 2003
    @@ -81,6 +81,9 @@
             p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
             p->parent->cnswap += p->nswap + p->cnswap;
             sched_exit(p);
    +
    + msa_update_parent(p->parent, p);
    +
             write_unlock_irq(&tasklist_lock);
             spin_unlock(&p->proc_lock);
             proc_pid_flush(proc_dentry);
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/kernel/fork.c linux-2.5-ustate/kernel/fork.c
    --- linux-2.5-import/kernel/fork.c Mon Jul 21 09:10:45 2003
    +++ linux-2.5-ustate/kernel/fork.c Mon Jul 21 15:06:20 2003
    @@ -792,6 +792,7 @@
     #endif
             p->did_exec = 0;
             p->state = TASK_UNINTERRUPTIBLE;
    + msa_init(p);
     
             copy_flags(clone_flags, p);
             if (clone_flags & CLONE_IDLETASK)
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/kernel/futex.c linux-2.5-ustate/kernel/futex.c
    --- linux-2.5-import/kernel/futex.c Fri Jun 13 09:37:27 2003
    +++ linux-2.5-ustate/kernel/futex.c Tue Jul 8 10:48:30 2003
    @@ -34,6 +34,7 @@
     #include <linux/futex.h>
     #include <linux/vcache.h>
     #include <linux/mount.h>
    +#include <linux/msa.h>
     
     #define FUTEX_HASHBITS 8
     
    @@ -349,6 +350,7 @@
              * the waiter from the list.
              */
             add_wait_queue(&q.waiters, &wait);
    + msa_next_state(current, FUTEX_SLEEP);
             set_current_state(TASK_INTERRUPTIBLE);
             if (!list_empty(&q.list)) {
                     unlock_futex_mm();
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/kernel/msa.c linux-2.5-ustate/kernel/msa.c
    --- linux-2.5-import/kernel/msa.c Thu Jan 1 10:00:00 1970
    +++ linux-2.5-ustate/kernel/msa.c Tue Jul 22 09:28:58 2003
    @@ -0,0 +1,333 @@
    +/*
    + * Microstate accounting.
    + * Try to account for various states much more accurately than
    + * the normal code does.
    + */
    +
    +
    +#include <config/microstate.h>
    +#include <linux/types.h>
    +#include <linux/errno.h>
    +#include <linux/linkage.h>
    +#ifdef CONFIG_MICROSTATE
    +#include <asm/irq.h>
    +#include <asm/hardirq.h>
    +#include <linux/sched.h>
    +#include <asm/uaccess.h>
    +#include <linux/jiffies.h>
    +
    +static clk_t queueflip_time[NR_CPUS];
    +
    +clk_t msa_irq_times[NR_CPUS][NR_IRQS + 1];
    +clk_t msa_irq_entered[NR_CPUS][NR_IRQS + 1];
    +int msa_irq_pids[NR_CPUS][NR_IRQS + 1];
    +
    +/*
    + * Switch from one task to another.
    + * The retiring task is coming off the processor;
    + * the new task is about to run on the processor.
    + *
    + * Update the time in both.
    + *
    + * We'll eventually account for user and sys time separately.
    + * For now, they're both accumulated into ONCPU_USER.
    + */
    +void
    +msa_switch(struct task_struct *prev, struct task_struct *next)
    +{
    + struct microstates *msprev = &prev->microstates;
    + struct microstates *msnext = &next->microstates;
    + clk_t now;
    + enum thread_state next_state;
    + int interrupted = msprev->cur_state == INTERRUPTED;
    +
    + MSA_NOW(now);
    +
    + if (msprev->flags & QUEUE_FLIPPED) {
    + queueflip_time[smp_processor_id()] = now;
    + msprev->flags &= ~QUEUE_FLIPPED;
    + }
    +
    + /*
    + * If the queues have been flipped,
    + * update the state as of the last flip time.
    + */
    + if (msnext->cur_state == ONEXPIREDQUEUE) {
    + msnext->cur_state = ONACTIVEQUEUE;
    + msnext->timers[ONEXPIREDQUEUE] += queueflip_time[msnext->lastqueued] - msnext->last_change;
    + msnext->last_change = queueflip_time[msnext->lastqueued];
    + }
    +
    + msprev->timers[msprev->cur_state] += now - msprev->last_change;
    + msnext->timers[msnext->cur_state] += now - msnext->last_change;
    +
    + /* Update states */
    + switch (msprev->next_state) {
    + case UNKNOWN: /*
    + * Infer from actual state
    + */
    + switch (prev->state) {
    + case TASK_INTERRUPTIBLE:
    + next_state = INTERRUPTIBLE_SLEEP;
    + break;
    +
    + case TASK_UNINTERRUPTIBLE:
    + next_state = UNINTERRUPTIBLE_SLEEP;
    + break;
    +
    + case TASK_STOPPED:
    + next_state = STOPPED;
    + break;
    +
    + case TASK_ZOMBIE:
    + next_state = ZOMBIE;
    + break;
    +
    + case TASK_DEAD:
    + next_state = ZOMBIE;
    + break;
    +
    + case TASK_RUNNING:
    + next_state = ONACTIVEQUEUE;
    + break;
    +
    + default:
    + next_state = UNKNOWN;
    + break;
    +
    + }
    + break;
    +
    + case PAGING_SLEEP: /*
    + * Sleep states are PAGING_SLEEP;
    + * others inferred from task state
    + */
    + switch(prev->state) {
    + case TASK_INTERRUPTIBLE: /* FALLTHROUGH */
    + case TASK_UNINTERRUPTIBLE:
    + next_state = PAGING_SLEEP;
    + break;
    +
    + case TASK_STOPPED:
    + next_state = STOPPED;
    + break;
    +
    + case TASK_ZOMBIE:
    + next_state = ZOMBIE;
    + break;
    +
    + case TASK_DEAD:
    + next_state = ZOMBIE;
    + break;
    +
    + case TASK_RUNNING:
    + next_state = ONACTIVEQUEUE;
    + break;
    +
    + default:
    + next_state = UNKNOWN;
    + break;
    + }
    + break;
    +
    + default: /* Explicitly set next state */
    + next_state = msprev->next_state;
    + msprev->next_state = UNKNOWN;
    + break;
    + }
    +
    + msprev->cur_state = next_state;
    + msprev->last_change = now;
    + msprev->lastqueued = smp_processor_id();
    +
    + msnext->cur_state = interrupted ? INTERRUPTED : (
    + msnext->flags & MSA_SYS ? ONCPU_SYS : ONCPU_USER);
    + msnext->last_change = now;
    +}
    +
    +/*
    + * Initialise the struct microstates in a new task (called from copy_process())
    + */
    +void msa_init(struct task_struct *p)
    +{
    + struct microstates *msp = &p->microstates;
    +
    + memset(msp, 0, sizeof *msp);
    + MSA_NOW(msp->last_change);
    + msp->cur_state = UNINTERRUPTIBLE_SLEEP;
    +}
    +
    +static void inline __msa_set_timer(struct microstates *msp, int next_state)
    +{
    + clk_t now;
    +
    + MSA_NOW(now);
    + msp->timers[msp->cur_state] += now - msp->last_change;
    + msp->last_change = now;
    + msp->cur_state = next_state;
    +
    +}
    +
    +/*
    + * Time stamp an explicit state change (called, e.g., from __activate_task())
    + */
    +void
    +msa_set_timer(struct task_struct *p, int next_state)
    +{
    + struct microstates *msp = &p->microstates;
    +
    + __msa_set_timer(msp, next_state);
    + msp->lastqueued = smp_processor_id();
    + msp->next_state = UNKNOWN;
    +}
    +
    +/*
    + * Helper routines, to be called from assembly language stubs
    + */
    +void msa_start_syscall(void)
    +{
    + struct microstates *msp = &current->microstates;
    +
    + __msa_set_timer(msp, ONCPU_SYS);
    + msp->flags |= MSA_SYS;
    +}
    +
    +void msa_end_syscall(void)
    +{
    + struct microstates *msp = &current->microstates;
    +
    + __msa_set_timer(msp, ONCPU_USER);
    + msp->flags &= ~MSA_SYS;
    +}
    +
    +/*
    + * Accumulate child times into parent, after zombie is over.
    + */
    +void msa_update_parent(struct task_struct *parent, struct task_struct *this)
    +{
    + enum thread_state s;
    + clk_t *msp = parent->microstates.child_timers;
    + struct microstates *mp = &this->microstates;
    + clk_t *msc = mp->timers;
    + clk_t *msgc = mp->child_timers;
    + clk_t now;
    +
    + /*
    + * State could be ZOMBIE (if parent is interested)
    + * or something else (if the parent isn't interested)
    + */
    + MSA_NOW(now);
    + msc[mp->cur_state] += now - mp->last_change;
    +
    + for (s = 0; s < NR_MICRO_STATES; s++) {
    + *msp++ += *msc++ + *msgc++;
    + }
    +}
    +
    +void msa_start_irq(int irq)
    +{
    + struct task_struct *p = current;
    + struct microstates *mp = &p->microstates;
    + clk_t now;
    + int cpu = smp_processor_id();
    +
    + MSA_NOW(now);
    + mp->timers[mp->cur_state] += now - mp->last_change;
    + mp->last_change = now;
    + mp->cur_state = INTERRUPTED;
    +
    + msa_irq_entered[cpu][irq] = now;
    + /* DEBUGGING */
    + msa_irq_pids[cpu][irq] = current->pid;
    +}
    +
    +void msa_continue_irq(int oldirq, int newirq)
    +{
    + clk_t now;
    + int cpu = smp_processor_id();
    + MSA_NOW(now);
    +
    + msa_irq_times[cpu][oldirq] += now - msa_irq_entered[cpu][oldirq];
    + msa_irq_entered[cpu][newirq] = now;
    + msa_irq_pids[cpu][newirq] = current->pid;
    +}
    +
    +
    +void msa_finish_irq(int irq)
    +{
    + struct task_struct *p = current;
    + struct microstates *mp = &p->microstates;
    + clk_t now;
    + int cpu = smp_processor_id();
    +
    + MSA_NOW(now);
    +
    + /*
    + * Interrupts can nest.
    + * Set current state to ONCPU
    + * iff we're not in a nested interrupt.
    + */
    + if (irq_count() == 0) {
    + mp->timers[mp->cur_state] += now - mp->last_change;
    + mp->last_change = now;
    + mp->cur_state = ONCPU_USER;
    + }
    + msa_irq_times[cpu][irq] += now - msa_irq_entered[cpu][irq];
    +
    +}
    +
    +/* return interrupt handling duration in microseconds */
    +clk_t msa_irq_time(int cpu, int irq)
    +{
    + clk_t x = MSA_TO_NSEC(msa_irq_times[cpu][irq]);
    + do_div(x, 1000);
    + return x;
    +}
    +
    +/*
    + * The msa system call --- get microstate data for self or waited-for children.
    + */
    +long asmlinkage sys_msa(int ntimers, int which, clk_t __user *timers)
    +{
    + clk_t now;
    + clk_t *tp;
    + int i;
    + struct microstates *msp = &current->microstates;
    +
    + switch (which) {
    + case MSA_SELF:
    + case MSA_CHILDREN:
    + break;
    + default:
    + return -EINVAL;
    + }
    +
    + if (ntimers > NR_MICRO_STATES)
    + ntimers = NR_MICRO_STATES;
    +
    + if (which == MSA_SELF) {
    + BUG_ON(msp->cur_state != ONCPU_USER);
    +
    + if (ntimers > 0) {
    + MSA_NOW(now);
    + /* Should be ONCPU_SYS */
    + msp->timers[ONCPU_USER] += now - msp->last_change;
    + msp->last_change = now;
    + }
    + }
    +
    + tp = which == MSA_SELF ? msp->timers : msp->child_timers;
    + for (i = 0; i < ntimers; i++) {
    + clk_t x = MSA_TO_NSEC(*tp++);
    + if (copy_to_user(timers++, &x, sizeof x))
    + return -EFAULT;
    + }
    + return 0L;
    +}
    +
    +#else
    +asmlinkage long sys_msa(int ntimers, __u64 *timers)
    +{
    + return -ENOSYS;
    +}
    +#endif
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/kernel/sched.c linux-2.5-ustate/kernel/sched.c
    --- linux-2.5-import/kernel/sched.c Mon Jul 21 09:10:45 2003
    +++ linux-2.5-ustate/kernel/sched.c Mon Jul 21 15:06:20 2003
    @@ -335,6 +335,7 @@
      */
     static inline void __activate_task(task_t *p, runqueue_t *rq)
     {
    + msa_set_timer(p, ONACTIVEQUEUE);
             enqueue_task(p, rq->active);
             nr_running_inc(rq);
     }
    @@ -558,6 +559,7 @@
             if (unlikely(!current->array))
                     __activate_task(p, rq);
             else {
    + msa_set_timer(p, ONACTIVEQUEUE);
                     p->prio = current->prio;
                     list_add_tail(&p->run_list, &current->run_list);
                     p->array = current->array;
    @@ -1241,6 +1243,7 @@
                     if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
                             if (!rq->expired_timestamp)
                                     rq->expired_timestamp = jiffies;
    + msa_next_state(p, ONEXPIREDQUEUE);
                             enqueue_task(p, rq->expired);
                     } else
                             enqueue_task(p, rq->active);
    @@ -1324,6 +1327,7 @@
                     rq->expired = array;
                     array = rq->active;
                     rq->expired_timestamp = 0;
    + msa_flip_expired(prev);
             }
     
             idx = sched_find_first_bit(array->bitmap);
    @@ -1339,6 +1343,8 @@
                     rq->nr_switches++;
                     rq->curr = next;
     
    + msa_switch(prev, next);
    +
                     prepare_arch_switch(rq, next);
                     prev = context_switch(rq, prev, next);
                     barrier();
    @@ -1994,6 +2000,7 @@
              */
             if (likely(!rt_task(current))) {
                     dequeue_task(current, array);
    + msa_next_state(current, ONEXPIREDQUEUE);
                     enqueue_task(current, rq->expired);
             } else {
                     list_del(&current->run_list);
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/mm/filemap.c linux-2.5-ustate/mm/filemap.c
    --- linux-2.5-import/mm/filemap.c Mon Jul 14 11:12:20 2003
    +++ linux-2.5-ustate/mm/filemap.c Tue Jul 22 09:28:58 2003
    @@ -273,8 +273,11 @@
             do {
                     prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
                     if (test_bit(bit_nr, &page->flags)) {
    + msa_next_state(current, PAGING_SLEEP);
                             sync_page(page);
    + msa_next_state(current, PAGING_SLEEP);
                             io_schedule();
    + msa_next_state(current, UNKNOWN);
                     }
             } while (test_bit(bit_nr, &page->flags));
             finish_wait(waitqueue, &wait);
    diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5-import/mm/memory.c linux-2.5-ustate/mm/memory.c
    --- linux-2.5-import/mm/memory.c Mon Jul 14 11:12:20 2003
    +++ linux-2.5-ustate/mm/memory.c Tue Jul 22 09:40:21 2003
    @@ -1490,16 +1490,20 @@
     
             entry = *pte;
             if (!pte_present(entry)) {
    + int ret;
    +
                     /*
                      * If it truly wasn't present, we know that kswapd
                      * and the PTE updates will not touch it later. So
                      * drop the lock.
                      */
                     if (pte_none(entry))
    - return do_no_page(mm, vma, address, write_access, pte, pmd);
    - if (pte_file(entry))
    - return do_file_page(mm, vma, address, write_access, pte, pmd);
    - return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
    + ret = do_no_page(mm, vma, address, write_access, pte, pmd);
    + else if (pte_file(entry))
    + ret = do_file_page(mm, vma, address, write_access, pte, pmd);
    + else
    + ret = do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
    + return ret;
             }
     
             if (write_access) {
    @@ -1532,6 +1536,7 @@
             if (is_vm_hugetlb_page(vma))
                     return VM_FAULT_SIGBUS; /* mapping truncation does this. */
     
    + msa_next_state(current, PAGING_SLEEP);
             /*
              * We need the page table lock to synchronize with kswapd
              * and the SMP-safe atomic PTE updates.
    @@ -1541,10 +1546,14 @@
     
             if (pmd) {
                     pte_t * pte = pte_alloc_map(mm, pmd, address);
    - if (pte)
    - return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
    + if (pte) {
    + int ret = handle_pte_fault(mm, vma, address, write_access, pte, pmd);
    + msa_next_state(current, UNKNOWN);
    + return ret;
    + }
             }
             spin_unlock(&mm->page_table_lock);
    + msa_next_state(current, UNKNOWN);
             return VM_FAULT_OOM;
     }
     
    -
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/


  • Next message: Herbert Pötzl: "Re: where can i find the src of No.0 process?"

    Relevant Pages