Clemson University -- CPSC 231 -- Fall 2009 Subroutines reasons for subroutines - repeat same code, or similar code with slightly different parameters - hide design decisions or design complexity - partition off code likely to change - provide for separate compilation - provide for reuse "open" subroutine = macro, resolved before assembly by textual substitution, body of routine is placed in-line at call site with parameter substitution (highly crafted macro example is cmul in appendix B of text) "closed" subroutine = standard notion, branch/execute/return at run time, one copy of body which accesses its formal parameters actual parameters - values or addresses passed to subroutine formal parameters - parameter names appearing in subroutine subroutine calls and returns are LIFO, and older processors typically save the return address of a caller by pushing it onto a memory stack older processors typically pass parameters by pushing them onto the same memory stack that holds return addresses Stack space on the stack can also be allocated for saving caller's registers and for local variables for the called subroutine (esp. important for recursive subroutines), this space is termed a "stack frame" sp - stack pointer, top of stack fp - frame pointer, anchors the stack frame for this subroutine invocation stack grows upwards in this picture | | +------------+ sp -->| ... | | locals | fp-c is usually an address of a local variable | ... | +------------+ fp -->| old fp | | saved regs | | rtn addr | +------------+ | ... | | parameters | fp+k is usually an address of a parameter | ... | +------------+ | | typical actions (see stack diagram above - built from bottom up) caller: /* pre-call */ push parameter2 on stack push parameter1 on stack call subroutine /* pushes return address onto the stack */ subroutine: /* prologue */ push registers to save /* callee save */ push old fp set new fp as current sp subtract from sp to make room for locals /* body */ /* epilogue */ set sp to current fp pop fp pop registers to restore return /* pops return address from stack */ /* post-call */ clean parameters off stack by adding to sp x86 example output for swap routine in C (sp called esp, fp called ebp) void main() { main: void swap(); pushl %ebp ! save old bp int a,b; movl %esp,%ebp ! set new bp as current sp a = 5; b = 44; subl $8,%esp ! sub for a and b swap(&a,&b); movl $5,-4(%ebp) ! initialize a } movl $44,-8(%ebp) ! initialize b leal -8(%ebp),%eax ! form addr of b pushl %eax ! push onto stack leal -4(%ebp),%eax ! form addr of a pushl %eax ! push onto stack call swap ! call addl $8,%esp ! clean parms off stack leave ret void swap(x,y) swap: int *x,*y; pushl %ebp ! save old bp { movl %esp,%ebp ! set new bp as current sp int temp; subl $4,%esp ! sub for temp temp = *x; movl 8(%ebp),%eax ! move addr x into eax *x = *y; movl (%eax),%edx ! indirectly get value in a *y = temp; movl %edx,-4(%ebp) ! store into temp return; movl 8(%ebp),%eax ! move addr x into eax } movl 12(%ebp),%edx ! move addr y into edx movl (%edx),%ecx ! indirectly get value in b movl %ecx,(%eax) ! store indirectly into a movl 12(%ebp),%eax ! move addr y into eax movl -4(%ebp),%edx ! move temp into edx movl %edx,(%eax) ! store indirectly into b leave ret handling a return value is not illustrated above subroutine passes value back, typically through a register caller copies value Parameter passing passing data to/from a subroutine can be done through the parameters and through the return value of a function subroutine parameter passing methods include: call-by-value - input parameters (declared with "IN") in Ada default method for parameters in C, C++, and Pascal parameters of primitive type in Java call-by-result - output parameters (declared with "OUT") in Ada call-by-value-result - in/out parameters (declared with "IN OUT") in Ada call-by-reference - large array parameters in Ada array parameters in C and C++ reference parameters (declared with "&") in C++ reference parameters (declared with "VAR") in Pascal object parameters in Java default method for parameters in Fortran call-by-value - copy values of actual parameters into memory locations of formal parameters before executing the body of the subroutine; do nothing on return main a = 1 a: 1 b = 2 b: 2 call subr(a,b) pass 1,2 via stack print a,b print 1,2 subr(x,y) copy 1,2 into x,y ^ x = x + 1 x: /1/ 2 | y = x + y y: /2/ 4 | return ---------------------' call-by-result - do nothing prior to executing the body of the subroutine; copy the final values of the formal parameters into the memory locations of the actual parameters on return main a = 1 a: 1 b = 2 b: 2 call subr(a,b) pass nothing receive ?,? from subr into a,b a: /1/ ? b: /2/ ? print a,b print ?,? subr(x,y) copy ?,? into x,y ^ x = x + 1 x: /?/ ? | y = x + y y: /?/ ? | return pass ?,? back via stack --' call-by-value-result - perform copying of values both before and after executing the body of the subroutine main a = 1 a: 1 b = 2 b: 2 call subr(a,b) pass 1,2 via stack receive 2,4 from subr into a,b a: /1/ 2 b: /2/ 4 print a,b print 2,4 subr(x,y) copy 1,2 into x,y ^ x = x + 1 x: /1/ 2 | y = x + y y: /2/ 4 | return pass 2,4 back via stack --' call-by-reference - pass the addresses of the actual parameters; copy these addresses into the memory locations of the formal parameters; on each reference to a formal parameter in the body of the subroutine, perform an indirect reference to the corresponding actual parameter; that is, the formal parameter is an alias of the actual parameter, thus both the formal and actual parameter "name" refer to the same object and changes made using the formal parameter are being executed on the object passed as the actual parameter main a = 1 a: 1 a: /1/ 2 action in subr b = 2 b: 2 b: /2/ 4 action in subr call subr(a,b) pass &a,&b via stack print a,b print 2,4 subr(x,y) copy &a,&b into x,y ^ x = x + 1 x: &a thus a = a + 1 | y = x + y y: &b thus b = a + b | return ------------------------' tradeoffs advantages disadvantages call-by-value loads and stores operate copying overhead directly on formal parms call-by-reference no copying, indirect reference through allows subr to change the addresses in formal parms values of the actual to actual parms (i.e., you parms have to chase pointers) optimizations can pass parameters in registers (not using stack) can sometimes not save/restore registers when executing a leaf routine [language note: you may come across call-by-name; it is an obsolete and complex parameter passing mechanism proposed for Algol 60; John Levine writes in comp.compilers in April 2009: "Alan Perlis, who was on the Algol 60 committee, told me that call by name was a mistake. They were trying to make an elegant definition of call by reference, and didn't realize what they'd done until Jensen's Device came along."] [language note: Dr. Murali Sitaraman of Clemson advocates call-by-swapping since it is a avoids aliasing; see www.nvc.vt.edu/gregwk/tako/swapping.html] SPARC registers SPARC designers wanted to eliminate as many memory accesses as possible generic subroutine call above requires at least four memory accesses 1. the return address (pc) is pushed by the call instruction 2. the old fp is pushed at the top of the subroutine 3. the old fp is popped at the bottom of the subroutine, and 4. the return address is popped by the return instruction then there are two memory accesses for each register saved (push to save and then pop to restore). the SPARC designers provided overlapped register windows instead save instruction - advances register window to allocate a new set of regs. restore instruction - restores old register window CPU +--------------------+ +-------+ main memory | +---+ | |.......| | | | register | | insts.| | | | windows | bus |.......| | |...| |=============| data | | | |<-----. | |.......| | |...| | | | heap | | | | | | |.......| | | | | PSR | | stack | | | | +---|-----+ | |.......| | | | |...CWP...| | | | | +---+ +---------+ | | | +--------------------+ | | | | the current register window is | | selected by the current window | | pointer (CWP) in the processor | | status register (PSR) | | +-------+ unless one of the following cases occurs, there are no memory accesses required by a subroutine call and return (a) the subroutine nesting level exceeds the depth of the windows (b) the number of parameters exceeds six (c) access to memory-allocated data or to a local variable that cannot be register-allocated windows before save after save . . . . . . . . save instruction changes . . . . the register mapping (CWP) +-----+ . . %i0-%i7 | | . . restore instruction changes %l0-%l7 | | . . back to previous mapping +-----+ +-----+ %o0-%o7 | |<->%i0-%i7 | | on window overflow, registers |/////| %l0-%l7 | | be written into memory +-----+ +-----+ . . %o0-%o7 | | . . |/////| . . +-----+ . . . . . . . . . . . . +-----+ +-----+ %g0-%g7 | | | | +-----+ +-----+ register usage conventions %i0 -- incoming parameter, also used to pass return value back %i1-%i5 -- additional incoming parameters %i6 -- frame pointer %i7 -- address of call instruction (return to %i7 + 8 if save) %l0-%l7 -- all yours %o0 -- all yours, often used as temp register, but also used to pass a parameter to and get a return value back from a subroutine %o1-%o5 -- all yours, but also used to pass parameters to a subroutine %o6 -- stack pointer %o7 -- address of call instruction (return to %o7 + 8 if no save) before save after save +--------+ . . . . .. %i7 | | . .\ %i6 | old fp | . . | subroutine cannot access these %i5 | | . . | registers of the calling program %i4 | | . . | %i3 | | . . | %i2 | | . . | %i1 | | . . | %i0 | | . . | +--------+ . . . . .. | %l7 | | . . | %l6 | | . . | %l5 | | . . | %l4 | | . . | %l3 | | . . | %l2 | | . . | %l1 | | . . | %l0 | | . ./ +--------+ +--------+ %o7 | & call | <-> %i7 | & call |-- ret inst. will return to %i7 + 8 %o6 | old sp | <-> %i6 | new fp |-- old sp becomes new fp %o5 | parm 6 | <-> %i5 | parm 6 |\ %o4 | parm 5 | <-> %i4 | parm 5 | | %o3 | parm 4 | <-> %i3 | parm 4 | | %o2 | parm 3 | <-> %i2 | parm 3 | | %o1 | parm 2 | <-> %i1 | parm 2 | | up to six parameters %o0 | parm 1 | <-> %i0 | parm 1 |/ +--------+ +--------+ . . %l7 | |\ . . %l6 | | | fresh set of local registers . . %l5 | | | for use by subroutine without . . %l4 | | | need for memory traffic . . %l3 | | | . . %l2 | | | . . %l1 | | | . . %l0 | |/ . . . . .. +--------+ %o7 is available for saved address if . . %o7 | |-- subroutine has any nested calls . . %o6 | new sp |-- new sp = old sp + value in save inst. . . %o5 | |\ . . %o4 | | | . . %o3 | | | available for parameters if . . %o2 | | | subroutine has any nested calls . . %o1 | | | . . %o0 | |/ . . . . .. +--------+ typical call /* pre-call */ move parameter1 to %o0 move parameter2 to %o1 call subroutine /* places address of call instruction in %o7 */ nop /* call requires a delay slot instruction */ subroutine: /* prologue */ save %sp,-96,%sp /* body */ /* epilogue */ ret /* return to %i7+8 */ restore /* post-call */ /* empty */ leaf procedure can use %g1-%g4 and %o0-%o5 without having to save registers /* pre-call */ move parameter1 to %o0 move parameter2 to %o1 call leaf_subroutine /* places address of call instruction in %o7 */ nop /* call requires a delay slot instruction */ leaf_subroutine: /* body */ /* epilogue */ retl /* return to %o7+8 */ nop /* retl requires a delay slot */ /* post-call */ /* empty */ notes call can have any 30-bit value for the displacement; thus, it can branch to any word in memory call could be viewed as a hardcoded jmpl pc,(30_bit_disp<<2),%o7 synthetic forms of jmpl call %o0 = jmpl %o0,%o7 - indirect call; subroutine address in %o0 ret = jmpl %i7,8,%g0 - change pc to contents of %i7 + 8, no link retl = jmpl %o7,8,%g0 - change pc to contents of %o7 + 8, no link example of parameter passing subroutine of the form int sum( int a, int b ){ return ( a + b ); } call of the form c = sum( 1, 2 ); after the call instruction places its address in %o7 ... + - + . . \ . . cannot access these ... | . . + - + | . . . . | ... . . / ... +---+ mov 1,%o0 %i7 | | \ mov 2,%o1 %i6 |ofp| | CURRENT REGISTER WINDOW --> x: call sum ... | nop %i1 | | | (let caller's frame pointer ... will place %i0 | 5 | | value be known as ofp, for return value +---+ | old frame pointer, and let in c ... %l7 | | | caller's stack pointer value %l6 | | | be known as osp, for old ... | stack pointer; 5 and 9 stand %l1 | | | for arbitrary values in the %l0 | 9 | | caller's %i0 and %l0 registers) +---+ | sum: save %sp,-96,%sp %o7 | x | | (the %o_ registers will overlap mov %i0,%l0 %o6 |osp| | so you place the parameters mov %i1,%l1 ... | and "return address" there -- add %l0,%l1,%l2 %o1 | 2 | | actually it is the address of mov %l2,%i0 %o0 | 1 | / the call instruction) ret +---+ restore . . \ . . cannot access these ... | . . + - + | . . . . | ... . . / + - + ... after the add instruction in the subroutine ... + - + . . \ . . cannot access these ... | . . + - + | . . . . | ... . . | ... + - + mov 1,%o0 . . | (caller's %i_ and %l_ registers mov 2,%o1 .ofp. cannot be accessed and thus x: call sum ... | will not be changed during nop . . execution of the subroutine) ... . 5 . | + - + . . | . . ... | . . . 9 . / +---+ sum: save %sp,-96,%sp %i7 | x | \ CURRENT REGISTER WINDOW mov %i0,%l0 %i6 |nfp| | (nfp is same value as osp) mov %i1,%l1 ... | --> add %l0,%l1,%l2 %i1 | 2 | | (the new %i_ registers overlap mov %l2,%i0 %i0 | 1 | | with caller's %o_ registers) ret +---+ | restore %l7 | | | (will use these local registers %l6 | | | without disturbing caller's ... | local registers) %l2 | 3 | | %l1 | 2 | | %l0 | 1 | | +---+ | %o7 | | | %o6 |nsp| | (new stack pointer value is ... | nsp = osp - 96) %o1 | | | %o0 | | / +---+ ... after the restore instruction in the subroutine ... + - + . . \ . . cannot access these ... | . . + - + | . . . . | ... . . / ... +---+ mov 1,%o0 %i7 | | \ mov 2,%o1 %i6 |ofp| | CURRENT REGISTER WINDOW x: call sum ... | nop %i1 | | | (caller's registers ... %i0 | 5 | | have been restored) +---+ | %l7 | | | %l6 | | | ... | %l1 | | | %l0 | 9 | | +---+ | sum: save %sp,-96,%sp %o7 | x | | (the subroutine's %i_ registers mov %i0,%l0 %o6 |osp| | are the caller's %o_ registers; mov %i1,%l1 ... | return value is found in %o0) add %l0,%l1,%l2 %o1 | 2 | | mov %l2,%i0 %o0 | 3 | / ret +---+ --> restore . . \ . . cannot access these any longer ... | . 3 . . 2 . | . 1 . + - + | . . .nsp. | ... . . / + - + ... SPARC stack frame in memory needed for use in window overflow and for memory-allocated local variables +------------+ stack must be doubleword aligned %sp -->| ... | | space for | | reg window | | ... | +------------+ | rtn struct |<-- %sp+64 expected size of return struct will +------------+ be placed in line after call | ... |<-- %sp+68 | space for | more space allocated (below %sp+92) | %i0-%i5 | whenever there are more than six | ... | parameters in a call from this frame +------------+ | pad |<-- %sp+92 (input parameters stored in %fp+68, +------------+ %fp+72, ..., %fp+88, and extra parms | ... |<-- %sp+96 in %fp+92, ... => in caller's frame; | locals | caller stores extra parms in %sp+92, | ... | ...) +------------+ %fp --> (%fp-4 is addr of last word in frame and would usually be the address of a local variable) (an extra 16 bytes is typically added by gcc to the size of the locals to support a move between the floating-point registers and integer registers -- enough space to hold a quad-precision floating-point number) generated assembly code for SPARC from gcc (no optimization) note call by reference (explicit use of & and * in C code) note that for ease of tracing the stack while debugging, all parameters are copied to the stack by the subroutine void main() { gcc2_compiled.: void swap(); .section ".text" int a,b; .align 4 a = 5; b = 44; .global main swap(&a,&b); .type main,#function } .proc 020 void swap(x,y) main: int *x,*y; !#PROLOGUE# 0 { save %sp,-120,%sp int temp; !#PROLOGUE# 1 temp = *x; mov 5,%o0 ! a = 5 *x = *y; st %o0,[%fp-20] *y = temp; mov 44,%o0 ! b = 44 return; st %o0,[%fp-24] } add %fp,-20,%o0 ! pass parms add %fp,-24,%o1 ! in %o0,%o1 call swap,0 nop .LL1: ret restore .LLfe1: .size main,.LLfe1-main .align 4 .global swap frames not drawn .type swap,#function to scale .proc 020 swap: !#PROLOGUE# 0 sp+0 ___ save %sp,-120,%sp |...| !#PROLOGUE# 1 |___| st %i0,[%fp+68] ! save parms |___| st %i1,[%fp+72] ! in stack |...| ld [%fp+68],%o0 ! %o1 = *x |...| ld [%o0],%o1 fp-20|tmp| st %o1,[%fp-20] ! temp = *x |...| ld [%fp+68],%o0 ! %o0 = x sp+0 ___ fp+0 |___| ld [%fp+72],%o1 ! %o2 = *y |...| |...| ld [%o1],%o2 |___| |___| st %o2,[%o0] ! *x = *y |___| |___| ld [%fp+72],%o0 ! %o0 = y | | fp+68| x | %i0 ld [%fp-20],%o1 ! %o1 = temp | | fp+72| y | %i1 st %o1,[%o0] ! *y = temp |...| |...| b .LL2 fp-24| b | | b | nop fp-20| a | | a | .LL2: |...| |...| ret fp+0 |___| |___| restore after after .LLfe2: save in save in .size swap,.LLfe2-swap main swap .ident "GCC: (GNU) 2.8.1" generated assembly code for SPARC from gcc (optimization turned on) note use of registers to pass parameters, delay slot scheduling for control transfers (call and retl), and leaf optimizations in swap() .file "swap.c" gcc2_compiled.: .section ".text" .align 4 void main() { .global main void swap(); .type main,#function .proc 020 main: !#PROLOGUE# 0 int a,b; save %sp, -120, %sp !#PROLOGUE# 1 a = 5; mov 5, %o0 st %o0, [%fp-20] b = 44; mov 44, %o0 st %o0, [%fp-24] swap(&a,&b); add %fp, -20, %o0 call swap, 0 add %fp, -24, %o1 ! scheduled in delay slot } ret restore .align 4 void swap(x,y) .global swap int *x,*y; .type swap,#function { .proc 020 int temp; swap: temp = *x; ld [%o0], %g3 *x = *y; ld [%o1], %g2 *y = temp; st %g2, [%o0] return; retl } st %g3, [%o1] ! scheduled in delay slot