Upload
lynde
View
41
Download
2
Embed Size (px)
DESCRIPTION
嵌入式處理器架構與 程式設計. 王建民 中央研究院 資訊所 2008 年 7 月. Contents. Introduction Computer Architecture ARM Architecture Development Tools GNU Development Tools ARM Instruction Set ARM Assembly Language ARM Assembly Programming GNU ARM ToolChain Interrupts and Monitor. - PowerPoint PPT Presentation
Citation preview
嵌入式處理器架構與程式設計
王建民中央研究院 資訊所
2008 年 7 月
2
Contents Introduction Computer Architecture ARM Architecture Development Tools GNU Development Tools ARM Instruction Set ARM Assembly Language ARM Assembly Programming GNU ARM ToolChain Interrupts and Monitor
Lecture 8ARM Assembly Programming
4
Outline
Assembly Programming Assembly-C Interface Peephole Optimization
5
Example #4: String Length#include <stdio.h>extern int mystrlen(char *s);int main(){ char s[20] = “Hello, World!\n”; printf("The length of the string is %d\n", mystrlen(s));}
int mystrlen(char *s1){ char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2-s1);}
6
Example #4: Pseudo Codeint mystrlen(char *s1)
{
char *s2;
s2 = s1;
while (*s2 != 0) {
s2++;
}
return (s2-s1);
}
mystrlen:
s2 = s1
start_loop:
if (*s2 == 0) goto end_loop
s2 = s2 + 1
goto start_loop
end_loop:
return (s2-s1)
7
Example #4: Storage Assignmentmystrlen:
s2 = s1
start_loop:
if (*s2 == 0) goto end_loop
s2 = s2 + 1
goto start_loop
end_loop:
return (s2-s1)
mystrlen:
r4 = r0
start_loop:
r5 = *r4
if (r5 == 0) goto end_loop
s4 = r4 + 1
goto start_loop
end_loop:
return (r4-r0)
8
Example #4: Final Assembly Code
mystrlen:
r4 = r0
start_loop:
r5 = *r4
if (r5 == 0) goto end_loop
r4 = r4 + 1
goto start_loop
end_loop:
return (r4-r0)
.text
.align 2
.global mystrlen
mystrlen:
mov r4, r0
start_loop:
ldrb r5, [r4]
cmp r5, #0
beq end_loop
add r4, r4, #1
b start_loop
end_loop:
sub r0, r4, r0
mov pc, lr
9
Example #5: Summation#include <stdio.h>extern int mysum(int n, int *array);int main(){ int a[5] = {1, 3, 5, 7, 9}; printf("The summation of the array is %d\n", mysum(5,a));}
int mysum(int n, int *array){ int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum;}
10
Example #5: Pseudo Codeint mysum(int n, int *array)
{
int i, sum;
sum = 0;
for (i = 0; i < n; i++) {
sum += array[i];
}
return sum;
}
mysum:
sum = 0
i = 0
start_loop:
if (i >= n) goto end_loop
sum = sum + array[i]
i = i + 1
goto start_loop
end_loop:
return sum
11
Example #5: Storage Assignmentmysum:
sum = 0
i = 0
start_loop:
if (i >= n) goto end_loop
sum = sum + array[i]
i = i + 1
goto start_loop
end_loop:
return sum
mysum:
r5 = 0
r4 = 0
start_loop:
if (r4 >= r0) goto end_loop
r6 = r1[r4]
r5 = r5 + r6
r4 = r4 + 1
goto start_loop
end_loop:
return r5
12
Example #5: Final Assembly Code
mysum: r5 = 0 r4 = 0start_loop: if (r4 >= r0) goto end_loop
r6 = r1[r4] r5 = r5 + r6 r4 = r4 + 1 goto start_loopend_loop: return r5
.text .align 2 .global mysummysum: mov r5, #0 mov r4, #0start_loop: cmp r4, r0 bge end_loop ldr r6, [r1,r4,LSL#2] add r5, r5, r6 add r4, r4, #1 b start_loopend_loop: mov r0, r5 mov pc, lr
13
Example #6: Bubble Sort1
#include <stdio.h>
extern void bubble(int n, int *a);
int main()
{
int i;
int a[5] = {9, 7, 5, 3, 1};
bubble(5, a);
printf("The sorted array:\n");
for (i = 0; i < 5; i++) {
printf("a[%d] = %d\n", i, a[i]);
}
}
14
Example #6: Bubble Sort2
void sort2(int *a, int *b){ int tmp; if (*b < *a) { tmp = *a; *a = *b; *b = tmp; }}
void bubble(int n, int *a){ int i, j; for (i = 0; i < n-1; i++) { for (j = 0; j < n-1-i; j++) { sort2(&a[j], &a[j+1]); } }}
15
Example #6: Pseudo Codevoid bubble(int n, int *a);{ int i, j; for (i = 0; i < n-1; i++) {
for (j = 0; j < n-1-i; j++) {
sort2(&a[j], &a[j+1]); }
}
}
bubble:
i = 0start_outer: if (i >= n-1) goto end_outer j = 0start_inner: if (j >= n-1-i) goto end_inner sort2(&a[j],&a[j+1]) j = j + 1 goto start_innerend_inner: i = i + 1 goto start_outerend_outer: return
16
Example #6: Storage Assignmentbubble: i = 0start_outer: if (i >= n-1) goto end_outer
j = 0start_inner: if (j >= n-1-i) goto end_inner
sort2(&a[j],&a[j+1]) j = j + 1 goto start_innerend_inner: i = i + 1 goto start_outerend_outer: return
bubble: r2 = 0start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer r3 = 0start_inner: r5 = r4 – r2 if (r3 >= r5) goto end_inner sort2(r1+r3*4,r1+r3*4+4) r3 = r3 + 1 goto start_innerend_inner: r2 = r2 + 1 goto start_outerend_outer: return
17
Example #6: Assembly Code?bubble: r2 = 0start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer
r3 = 0start_inner: r5 = r4 – r2 if (r3 >= r5) goto end_inner
sort2(r1+r3*4,r1+r3*4+4)
r3 = r3 + 1 goto start_innerend_inner: r2 = r2 + 1 goto start_outerend_outer: return
bubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr
18
Example #6: Final Assembly Codebubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner
add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2
add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr
bubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner stmfd sp!,{r0-r3,lr} add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 ldmfd sp, {r0-r3,lr} add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr
19
Outline
Assembly Programming Assembly-C Interface Peephole Optimization
20
Generating Assembly Code from C
In this course, we will be using the GNU ARM ToolChain.
To compile a C program to assembly code arm-elf-gcc –S filename.c When you compile a .c file, you get a .s file This .s file contains the assembly language code
When assembled, this code can potentially be linked and loaded as an executable
To display information from an object file arm-elf-objdump –S –r filename
21
Example #7: A Simple Programint a, b;
int main()
{
a = 3;
b = 4;
} /* end main() */
.file "example4.c" .text .align 2 .global main .type main, %functionmain: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r2, .L3 mov r3, #3 str r3, [r2, #0] ldr r2, .L3+4 mov r3, #4 str r3, [r2, #0] ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word a .word b .size main, .-main .comm a,4,4 .comm b,4,4 .ident "GCC: (GNU) 4.0.0"Declare storage for a and b
Loader will put addresses of a and b in this memory location
22
Example #7: Object Fileexample1.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <main>: 0: e1a0c00d mov ip, sp 4: e92dd800 stmdb sp!, {fp, ip, lr, pc} 8: e24cb004 sub fp, ip, #4 ; 0x4 c: e59f2014 ldr r2, [pc, #20] ; 28 <.text+0x28> 10: e3a03003 mov r3, #3 ; 0x3 14: e5823000 str r3, [r2] 18: e59f200c ldr r2, [pc, #12] ; 2c <.text+0x2c> 1c: e3a03004 mov r3, #4 ; 0x4 20: e5823000 str r3, [r2] 24: e89da800 ldmia sp, {fp, sp, pc}
...28: R_ARM_ABS32 a2c: R_ARM_ABS32 b
23
Example #7: Executable File00008208 <main>:
8208: e1a0c00d mov ip, sp
820c: e92dd800 stmdb sp!, {fp, ip, lr, pc}
8210: e24cb004 sub fp, ip, #4 ; 0x4
8214: e59f2014 ldr r2, [pc, #20] ; 8230 <.text+0x210>
8218: e3a03003 mov r3, #3 ; 0x3
821c: e5823000 str r3, [r2]
8220: e59f200c ldr r2, [pc, #12] ; 8234 <.text+0x214>
8224: e3a03004 mov r3, #4 ; 0x4
8228: e5823000 str r3, [r2]
822c: e89da800 ldmia sp, {fp, sp, pc}
8230: 0000adc4 andeq sl, r0, r4, asr #27
8234: 0000adc0 andeq sl, r0, r0, asr #27
24
Example #8: Calling A Functionint tmp; void swap(int a, int b);int main() {
int a, b; a = 3; b = 4; swap(a, b);
} /* end main() */
void swap(int a, int b) {
tmp = a; a = b; b = tmp;
} /* end swap() */
25
Example #8: Assembly Listingmain: mov ip, sp stmfd sp!, {fp, ip, lr,
pc} sub fp, ip, #4 sub sp, sp, #8 mov r3, #3 str r3, [fp, #-20] mov r3, #4 str r3, [fp, #-16] ldr r0, [fp, #-20] ldr r1, [fp, #-16] bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc}
swap: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #8 str r0, [fp, #-16] str r1, [fp, #-20] ldr r2, .L5 ldr r3, [fp, #-16] str r3, [r2, #0] ldr r3, [fp, #-20] str r3, [fp, #-16] ldr r3, .L5 ldr r3, [r3, #0] str r3, [fp, #-20] sub sp, fp, #12 ldmfd sp, {fp, sp, pc}.L6: .align 2.L5: .word tmp .comm tmp,4,4
26
Example #9: Manipulating Pointersint tmp; int *pa, *pb; void swap(int a, int b); int main() { int a, b; pa = &a; pb = &b; *pa = 3; *pb = 4; swap(*pa, *pb); } /* end main() */
void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */
27
Example #9: Assembly Listingmain: mov ip, sp stmfd sp!, {fp, ip, lr,
pc} sub fp, ip, #4 sub sp, sp, #8 ldr r2, .L3 sub r3, fp, #16 str r3, [r2, #0] ldr r2, .L3+4 sub r3, fp, #20 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #3 str r3, [r2, #0] ldr r3, .L3+4 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0]
ldr r3, .L3 ldr r3, [r3, #0] ldr r2, [r3, #0] ldr r3, .L3+4 ldr r3, [r3, #0] ldr r3, [r3, #0] mov r0, r2 mov r1, r3 bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word pa .word pb
28
Example #10: Dealing with structtypedef struct testStruct
{
unsigned int a;
unsigned int b;
char c;
} testStruct;
testStruct *ptest;
int main()
{
ptest >a = 4;
ptest >b = 10;
ptest >c = 'A';
} /* end main() */
main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r3, .L3 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #10 str r3, [r2, #4] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #65 strb r3, [r2, #8] ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word ptest
29
Example #11: Passing Argumentsint tmp; void test(int a, int b, int c, int d, int *e); int main() { int a, b, c, d, e; a = 3; b = 4; c = 5; d = 6; e = 7; test(a, b, c, d, &e); } /* end main() */
void test(int a, int b, int c, int d, int *e) { tmp = a; a = b; b = tmp; c = b; b = d; *e = d; } /* end test() */
30
Example #11: Assembly Listing1
main:
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #24
mov r3, #3
str r3, [fp, #-28]
mov r3, #4
str r3, [fp, #-24]
mov r3, #5
str r3, [fp, #-20]
mov r3, #6
str r3, [fp, #-16]
mov r3, #7
str r3, [fp, #-32]
sub r3, fp, #32
str r3, [sp, #0]
ldr r0, [fp, #-28]
ldr r1, [fp, #-24]
ldr r2, [fp, #-20]
ldr r3, [fp, #-16]
bl test
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
31
Example #11: Assembly Listing2
test:
mov ip, sp
stmfd sp!, {fp, ip, lr, pc}
sub fp, ip, #4
sub sp, sp, #16
str r0, [fp, #-16]
str r1, [fp, #-20]
str r2, [fp, #-24]
str r3, [fp, #-28]
ldr r2, .L5
ldr r3, [fp, #-16]
str r3, [r2, #0]
ldr r3, [fp, #-20]
str r3, [fp, #-16]
ldr r3, .L5
ldr r3, [r3, #0]
str r3, [fp, #-20]
ldr r3, [fp, #-20]
str r3, [fp, #-24]
ldr r3, [fp, #-28]
str r3, [fp, #-20]
ldr r2, [fp, #4]
ldr r3, [fp, #-28]
str r3, [r2, #0]
sub sp, fp, #12
ldmfd sp, {fp, sp, pc}
.L6:
.align 2
.L5:
.word tmp
32
Interfacing C and Assembly ARM has developed a standard called the “ARM
Procedure Call Standard” (APCS) which defines: constraints on the use of registers stack conventions format of a stack backtrace data structure argument passing and result return support for ARM shared library mechanism
Compiler generated code conforms to the APCS It's just a standard not an architectural requirement Cannot avoid standard when interfacing C and
assembly code Can avoid standard when just writing assembly code or
when writing assembly code that isn't called by C code
33
Register Names and UseRegister # APCS Name APCS Role
R0 a1 argument 1
R1 a2 argument 2
R2 a3 argument 3
R3 a4 argument 4
R4..R8 v1..v5 register variables
R9 sb/v6 static base/register variable
R10 sl/v7 stack limit/register variable
R11 fp frame pointer
R12 ip scratch reg/new sb in inter link unit calls
R13 sp low end of current stack frame
R14 lr link address/scratch register
R15 pc program counter
34
How Does STM Work on Memory ?
STM sp!, {r0 r15} The ARM processor
uses a bit-vector to represent each register to be saved
The architecture places the lowest number register into the lowest address
Default STM == STMDB == STMFD
pclrsp
SPbefore
address0x900x8c0x880x840x800x7c0x780x740x700x6c0x680x640x600x5c0x580x540x50
ipfpv7v6v5v4v3v2v1a4a3a2a1SPafter
35
Passing and Returning Structures
Structures are usually passed in registers (and overflow onto the stack when necessary)
When a function returns a struct, a pointer to where the struct result is to be placed is passed in a1 (first argument)
Example struct s f(int x);
is compiled as
void f(struct s *result, int x);
36
Example #12: Passing Structurestypedef struct two_ch_struct{
char ch1;
char ch2;
} two_ch;
two_ch max(two_ch a, two_ch b){
return((a.ch1 > b.ch1)?a:b);
} /* end max() */
max: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #12 str r0, [fp, #-24] str r1, [fp, #-16] str r2, [fp, #-20] ldrb r2, [fp, #-16] ldrb r3, [fp, #-20] cmp r2, r3 bls .L2 ldr r3, [fp, #-16] ldr r2, [fp, #-24] str r3, [r2, #0] b .L1.L2: ldr r3, [fp, #-20] ldr r2, [fp, #-24] str r3, [r2, #0].L1: ldr r0, [fp, #-24] sub sp, fp, #12 ldmfd sp, {fp, sp, pc}
37
The Frame Pointer• Frame pointer (fp) points to the top
of stack for function By using the frame pointer and
storing it at the same offset for every function call, it creates a singly linked list of activation records
foo: mov ip,sp stmfd sp!,{a1 a3,fp,ip,lr,pc} sub fp,ip,#4 <computations go here> sub fp,fp,#12 ldmfd fp,{fp,sp,pc}
pc
lr
ip
fp
address0x900x8c0x880x840x800x7c0x780x740x70
fp
a3
a2
a1
ip
sp
38
Backtrace
The fp register points to the stack backtrace structure for the currently executing function.
The saved fp value is (zero or) a pointer to a stack backtrace structure created by the function which called the current function.
The saved fp value in this structure is a pointer to the stack backtrace structure for the function that called the function that called the current function; and so on back until the first function.
39
Creating the “Backtrace” Structure
MOV ip, sp
STMFD sp!,{a1 a4,v1 v7,fp,ip,sp,lr,pc}
SUB fp, ip, #4
…
…
sub fp, fp, #16
LDMFD fp, {fp,sp,sb,pc}
SPbefore
address0x900x8c0x880x840x800x7c0x780x740x700x6c0x680x640x600x5c0x580x540x50
SPcurrent
FPafter (saved) pc(saved) lr(saved) sp(saved) ip(saved) fp
v7v6v5v4v3v2v1a4a3a2a1
IPcurrent
40
Example Backtrace
(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp
v7v6v5v4v3v2v1a4a3a2a1
(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp
v7v6v5v4v3v2v1a4a3a2a1
(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp
v7v6v5v4v3v2v1a4a3a2a1
fp
bar’s framefoo’s frame
main’s frame
41
Exercise #1
Write an assembly subroutine that implements the quicksort algorithm to sort a list of unsigned integer values. The first entry in the list is the list’s length. void quickSort(unsigned int *list);
Input Output
list: 0x00000005 0x00000005
0xA356A101 0x09250037
0xE235C203 0x29567322
0x7A35B310 0x7A35B310
0x09250037 0xA356A101
0x29567322 0xE235C203
42
Exercise #2 Write an assembly subroutine that deletes an item
from an ordered list of unsigned values if it is not already there. The first entry in the list is the list’s length. void removeItem(unsigned int item, unsigned int *list);
Input Outputitem: 0x7A35B310list: 0x00000005 0x00000004
0x09250037 0x092500370x29567322 0x295673220x7A35B310 0xA356A1010xA356A101 0xE235C2030xE235C203
43
Outline
Assembly Programming Assembly-C Interface Peephole Optimization
44
Peephole Optimization
Final pass over generated code: Examine a few consecutive instructions: 2 to 4
See if an obvious replacement is possible: store/load pairs
MOV %eax => mema
MOV mema => %eax Can eliminate the second instruction without needing
any global knowledge of mema Use algebraic identities Special-case individual instructions
45
Algebraic Identities
Worth recognizing single instructions with a constant operand: A * 2 = A + A A * 1 = A A * 0 = 0 A / 1 = A
More delicate with floating-point
46
Is this ever helpful?
Why would anyone write X * 1? Why bother to correct such obvious junk
code? In fact one might write
#define MAX_TASKS 1...a = b * MAX_TASKS;
Also, seemingly redundant code can be produced by other optimizations. This is an important effect.
47
Replace Multiply by Shift
A := A * 4; Can be replaced by 2-bit left shift
(signed/unsigned) But must worry about overflow if language
does A := A / 4;
If unsigned, can replace with shift right But shift right arithmetic is a well-known
problem Language may allow it anyway (traditional C)
48
Addition Chains for Multiplication
If multiply is very slow (or on a machine with no multiply instruction like the original SPARC), decomposing a constant operand into sum of powers of two can be effective: X * 125 = x * 128 – x * 4 + x Two shifts, one subtract and one add, which
may be faster than one multiply Note similarity with efficient exponentiation
method
49
The Right Shift Problem
Arithmetic Right shift: Shift right and use sign bit to fill most
significant bits -5 111111...1111111011 SAR 111111...1111111101 Which is -3, not -2 In most languages -5/2 = -2 Prior to C99, implementations were allowed to
truncate towards or away from zero if either operand was negative
50
Folding Jumps to Jumps
A jump to an unconditional jump can copy the target address
JNE lab1
...
lab1 JMP lab2 Can be replaced by
JNE lab2 As a result, lab1 may become dead
(unreferenced)
51
Jump to Return
A jump to a return can be replaced by a return
JMP lab1...
lab1 RET Can be replaced by
RET lab1 may become dead code
52
Tail Recursion Elimination1
A subprogram is tail-recursive if the last computation is a call to itself:
function last (lis : list_type) return lis_type is
begin
if lis.next = null then return lis;
else return last (lis.next);
end; Recursive call can be replaced with lis := lis.next;
goto start; -- added label
53
Tail Recursion Elimination2
Saves time: an assignment and jump is faster than a call with one parameter
Saves stack space: converts linear stack usage to constant usage.
In languages with no loops, this may be a required optimization: specified in Scheme standard.
54
Tail Recursion Elimination3
Consider the sequence on the x86:CALL funcRET
CALL pushes return point on stack, RET in body of func removes it, RET in caller returns
Can generate instead:JMP func
Now RET in func returns to original caller, because single return address on stack
55
The REALIA COBOL Compiler1
Full compiler for Standard COBOL, targeted to the IBM PC.
Now distributed by Computer Associates Runs in 150K bytes, but must be able to
handle very large programs that run on mainframes
56
The REALIA COBOL Compiler2
No global optimization possible: multiple linear passes over code, no global data structures, no flow graph.
Multiple peephole optimizations, compiler iterates until code is stable. Each pass scan code backwards to minimize address recomputations
57
Typical COBOL Code
Process-Balance. if Balance is negative then perform Send-Bill else perform Record-Credit end-if.Send-Bill. ...Record-Credit. ...
58
Simple Assembly
Pb: cmp balance, 0
jnl L1
call Sb
jmp L2 -- jump to return
L1: call Rc
L2: ret
Sb: …
ret
Rc: …
ret
59
Fold Jump to Return Statement
Pb: cmp balance, 0
jnl L1
call Sb -- tail recursion
ret -- folded
L1: call Rc -- tail recursion
L2: ret
Sb: …
ret
Rc: …
ret
60
Eliminate Tail Recursion
Pb: cmp balance, 0
jnl L1 -- jump to unconditional jump
imp Sb
ret
L1: jmp Rc -- will become useless
L2: ret
Sb: …
ret
Rc: …
ret
61
Corresponding Assembly
Pb: cmp balance, 0
jnl Rc -- folded
jmp Sb
ret -- unreachable
L1: jmp Rc -- unreachable
L2: ret -- unreachable
Sb: …
ret
Rc: …
ret
62
Remove Dead Code
Pb: cmp balance, 0
jnl Rc
jmp Sb -- jump to next instruction
Sb: …
ret
Rc: …
ret
63
Final Code
Pb: cmp balance, 0
jnl Rc
Sb: …
ret
Rc: …
ret Final code as efficient as inlining. All transformations are local. Each optimization
may yield further optimization opportunities. Iterate till no further change.
64
Arcane Tricks
Consider typical maximum computationif A >= B then
C := A;
else
C := B;
end if; For simplicity assume all unsigned, and all in
registers
65
Eliminating Max Jump on x86
Simple-minded assembly codeCMP A, B
JNAE L1
MOVA=>C
JMP L2
L1: MOVB=>C
L2: One jump in either case
66
Computing Max without Jumps Architecture-specific trick: use subtract with borrow
instruction and carry flag CMP A, B ; CF=1 if B > A, CF = 0 if A >= B
SBB %eax, %eax ; all 1's if B > A, all 0's if A >= BMOV %eax, CNOT C ; all 0's if B > A, all 1's if A >= BAND B=>%eax ; B if B>A, 0 if A>=BAND A=>C ; 0 if B >A, A if A>=BOR %eax=>C ; B if B>A, A if A>=B
More instructions, but NO JUMPS Supercompiler: exhaustive search of instruction patterns to
uncover similar tricks