66
嵌入式處理器架構與程式設計 王建民 中央研究院 資訊所 2008 年 7 月

嵌入式處理器架構與 程式設計

  • Upload
    lynde

  • View
    41

  • Download
    2

Embed Size (px)

DESCRIPTION

嵌入式處理器架構與 程式設計. 王建民 中央研究院 資訊所 2008 年 7 月. Contents. Introduction Computer Architecture ARM Architecture Development Tools GNU Development Tools ARM Instruction Set ARM Assembly Language ARM Assembly Programming GNU ARM ToolChain Interrupts and Monitor. - PowerPoint PPT Presentation

Citation preview

Page 1: 嵌入式處理器架構與 程式設計

嵌入式處理器架構與程式設計

王建民中央研究院 資訊所

2008 年 7 月

Page 2: 嵌入式處理器架構與 程式設計

2

Contents Introduction Computer Architecture ARM Architecture Development Tools GNU Development Tools ARM Instruction Set ARM Assembly Language ARM Assembly Programming GNU ARM ToolChain Interrupts and Monitor

Page 3: 嵌入式處理器架構與 程式設計

Lecture 8ARM Assembly Programming

Page 4: 嵌入式處理器架構與 程式設計

4

Outline

Assembly Programming Assembly-C Interface Peephole Optimization

Page 5: 嵌入式處理器架構與 程式設計

5

Example #4: String Length#include <stdio.h>extern int mystrlen(char *s);int main(){ char s[20] = "Hello, World!\n"; printf("The length of the string is %d\n", mystrlen(s));}

int mystrlen(char *s1){ char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2-s1);}

Page 6: 嵌入式處理器架構與 程式設計

6

Example #4: Pseudo Codeint mystrlen(char *s1)

{

char *s2;

s2 = s1;

while (*s2 != 0) {

s2++;

}

return (s2-s1);

}

mystrlen:

s2 = s1

start_loop:

if (*s2 == 0) goto end_loop

s2 = s2 + 1

goto start_loop

end_loop:

return (s2-s1)

Page 7: 嵌入式處理器架構與 程式設計

7

Example #4: Storage Assignmentmystrlen:

s2 = s1

start_loop:

if (*s2 == 0) goto end_loop

s2 = s2 + 1

goto start_loop

end_loop:

return (s2-s1)

mystrlen:

r4 = r0

start_loop:

r5 = *r4

if (r5 == 0) goto end_loop

r4 = r4 + 1

goto start_loop

end_loop:

return (r4-r0)

Page 8: 嵌入式處理器架構與 程式設計

8

Example #4: Final Assembly Code

mystrlen:

r4 = r0

start_loop:

r5 = *r4

if (r5 == 0) goto end_loop

r4 = r4 + 1

goto start_loop

end_loop:

return (r4-r0)

.text

.align 2

.global mystrlen

mystrlen:

mov r4, r0

start_loop:

ldrb r5, [r4]

cmp r5, #0

beq end_loop

add r4, r4, #1

b start_loop

end_loop:

sub r0, r4, r0

mov pc, lr

Page 9: 嵌入式處理器架構與 程式設計

9

Example #5: Summation#include <stdio.h>extern int mysum(int n, int *array);int main(){ int a[5] = {1, 3, 5, 7, 9}; printf("The summation of the array is %d\n", mysum(5,a));}

int mysum(int n, int *array){ int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum;}

Page 10: 嵌入式處理器架構與 程式設計

10

Example #5: Pseudo Codeint mysum(int n, int *array)

{

int i, sum;

sum = 0;

for (i = 0; i < n; i++) {

sum += array[i];

}

return sum;

}

mysum:

sum = 0

i = 0

start_loop:

if (i >= n) goto end_loop

sum = sum + array[i]

i = i + 1

goto start_loop

end_loop:

return sum

Page 11: 嵌入式處理器架構與 程式設計

11

Example #5: Storage Assignmentmysum:

sum = 0

i = 0

start_loop:

if (i >= n) goto end_loop

sum = sum + array[i]

i = i + 1

goto start_loop

end_loop:

return sum

mysum:

r5 = 0

r4 = 0

start_loop:

if (r4 >= r0) goto end_loop

r6 = r1[r4]

r5 = r5 + r6

r4 = r4 + 1

goto start_loop

end_loop:

return r5

Page 12: 嵌入式處理器架構與 程式設計

12

Example #5: Final Assembly Code

mysum: r5 = 0 r4 = 0start_loop: if (r4 >= r0) goto end_loop

r6 = r1[r4] r5 = r5 + r6 r4 = r4 + 1 goto start_loopend_loop: return r5

.text .align 2 .global mysummysum: mov r5, #0 mov r4, #0start_loop: cmp r4, r0 bge end_loop ldr r6, [r1,r4,LSL#2] add r5, r5, r6 add r4, r4, #1 b start_loopend_loop: mov r0, r5 mov pc, lr

Page 13: 嵌入式處理器架構與 程式設計

13

Example #6: Bubble Sort1

#include <stdio.h>

extern void bubble(int n, int *a);

int main()

{

int i;

int a[5] = {9, 7, 5, 3, 1};

bubble(5, a);

printf("The sorted array:\n");

for (i = 0; i < 5; i++) {

printf("a[%d] = %d\n", i, a[i]);

}

}

Page 14: 嵌入式處理器架構與 程式設計

14

Example #6: Bubble Sort2

void sort2(int *a, int *b){ int tmp; if (*b < *a) { tmp = *a; *a = *b; *b = tmp; }}

void bubble(int n, int *a){ int i, j; for (i = 0; i < n-1; i++) { for (j = 0; j < n-1-i; j++) { sort2(&a[j], &a[j+1]); } }}

Page 15: 嵌入式處理器架構與 程式設計

15

Example #6: Pseudo Codevoid bubble(int n, int *a){ int i, j; for (i = 0; i < n-1; i++) {

for (j = 0; j < n-1-i; j++) {

sort2(&a[j], &a[j+1]); }

}

}

bubble:

i = 0start_outer: if (i >= n-1) goto end_outer j = 0start_inner: if (j >= n-1-i) goto end_inner sort2(&a[j],&a[j+1]) j = j + 1 goto start_innerend_inner: i = i + 1 goto start_outerend_outer: return

Page 16: 嵌入式處理器架構與 程式設計

16

Example #6: Storage Assignmentbubble: i = 0start_outer: if (i >= n-1) goto end_outer

j = 0start_inner: if (j >= n-1-i) goto end_inner

sort2(&a[j],&a[j+1]) j = j + 1 goto start_innerend_inner: i = i + 1 goto start_outerend_outer: return

bubble: r2 = 0start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer r3 = 0start_inner: r5 = r4 – r2 if (r3 >= r5) goto end_inner sort2(r1+r3*4,r1+r3*4+4) r3 = r3 + 1 goto start_innerend_inner: r2 = r2 + 1 goto start_outerend_outer: return

Page 17: 嵌入式處理器架構與 程式設計

17

Example #6: Assembly Code?bubble: r2 = 0start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer

r3 = 0start_inner: r5 = r4 – r2 if (r3 >= r5) goto end_inner

sort2(r1+r3*4,r1+r3*4+4)

r3 = r3 + 1 goto start_innerend_inner: r2 = r2 + 1 goto start_outerend_outer: return

bubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr

Page 18: 嵌入式處理器架構與 程式設計

18

Example #6: Final Assembly Codebubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner

add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2

add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr

bubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner stmfd sp!,{r0-r3,lr} add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 ldmfd sp!, {r0-r3,lr} add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr

Page 19: 嵌入式處理器架構與 程式設計

19

Outline

Assembly Programming Assembly-C Interface Peephole Optimization

Page 20: 嵌入式處理器架構與 程式設計

20

Generating Assembly Code from C

In this course, we will be using the GNU ARM ToolChain.

To compile a C program to assembly code arm-elf-gcc -S filename.c When you compile a .c file, you get a .s file This .s file contains the assembly language code

When assembled, this code can potentially be linked and loaded as an executable

To display information from an object file arm-elf-objdump -S -r filename

Page 21: 嵌入式處理器架構與 程式設計

21

Example #7: A Simple Programint a, b;

int main()

{

a = 3;

b = 4;

} /* end main() */

.file "example4.c" .text .align 2 .global main .type main, %functionmain: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r2, .L3 mov r3, #3 str r3, [r2, #0] ldr r2, .L3+4 mov r3, #4 str r3, [r2, #0] ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word a .word b .size main, .-main .comm a,4,4 .comm b,4,4 .ident "GCC: (GNU) 4.0.0"Declare storage for a and b

Loader will put addresses of a and b in this memory location

Page 22: 嵌入式處理器架構與 程式設計

22

Example #7: Object Fileexample1.o: file format elf32-littlearm

Disassembly of section .text:

00000000 <main>: 0: e1a0c00d mov ip, sp 4: e92dd800 stmdb sp!, {fp, ip, lr, pc} 8: e24cb004 sub fp, ip, #4 ; 0x4 c: e59f2014 ldr r2, [pc, #20] ; 28 <.text+0x28> 10: e3a03003 mov r3, #3 ; 0x3 14: e5823000 str r3, [r2] 18: e59f200c ldr r2, [pc, #12] ; 2c <.text+0x2c> 1c: e3a03004 mov r3, #4 ; 0x4 20: e5823000 str r3, [r2] 24: e89da800 ldmia sp, {fp, sp, pc}

...28: R_ARM_ABS32 a2c: R_ARM_ABS32 b

Page 23: 嵌入式處理器架構與 程式設計

23

Example #7: Executable File00008208 <main>:

8208: e1a0c00d mov ip, sp

820c: e92dd800 stmdb sp!, {fp, ip, lr, pc}

8210: e24cb004 sub fp, ip, #4 ; 0x4

8214: e59f2014 ldr r2, [pc, #20] ; 8230 <.text+0x210>

8218: e3a03003 mov r3, #3 ; 0x3

821c: e5823000 str r3, [r2]

8220: e59f200c ldr r2, [pc, #12] ; 8234 <.text+0x214>

8224: e3a03004 mov r3, #4 ; 0x4

8228: e5823000 str r3, [r2]

822c: e89da800 ldmia sp, {fp, sp, pc}

8230: 0000adc4 andeq sl, r0, r4, asr #27

8234: 0000adc0 andeq sl, r0, r0, asr #27

Page 24: 嵌入式處理器架構與 程式設計

24

Example #8: Calling A Functionint tmp; void swap(int a, int b);int main() {

int a, b; a = 3; b = 4; swap(a, b);

} /* end main() */

void swap(int a, int b) {

tmp = a; a = b; b = tmp;

} /* end swap() */

Page 25: 嵌入式處理器架構與 程式設計

25

Example #8: Assembly Listingmain: mov ip, sp stmfd sp!, {fp, ip, lr,

pc} sub fp, ip, #4 sub sp, sp, #8 mov r3, #3 str r3, [fp, #-20] mov r3, #4 str r3, [fp, #-16] ldr r0, [fp, #-20] ldr r1, [fp, #-16] bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc}

swap: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #8 str r0, [fp, #-16] str r1, [fp, #-20] ldr r2, .L5 ldr r3, [fp, #-16] str r3, [r2, #0] ldr r3, [fp, #-20] str r3, [fp, #-16] ldr r3, .L5 ldr r3, [r3, #0] str r3, [fp, #-20] sub sp, fp, #12 ldmfd sp, {fp, sp, pc}.L6: .align 2.L5: .word tmp .comm tmp,4,4

Page 26: 嵌入式處理器架構與 程式設計

26

Example #9: Manipulating Pointersint tmp; int *pa, *pb; void swap(int a, int b); int main() { int a, b; pa = &a; pb = &b; *pa = 3; *pb = 4; swap(*pa, *pb); } /* end main() */

void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */

Page 27: 嵌入式處理器架構與 程式設計

27

Example #9: Assembly Listingmain: mov ip, sp stmfd sp!, {fp, ip, lr,

pc} sub fp, ip, #4 sub sp, sp, #8 ldr r2, .L3 sub r3, fp, #16 str r3, [r2, #0] ldr r2, .L3+4 sub r3, fp, #20 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #3 str r3, [r2, #0] ldr r3, .L3+4 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0]

ldr r3, .L3 ldr r3, [r3, #0] ldr r2, [r3, #0] ldr r3, .L3+4 ldr r3, [r3, #0] ldr r3, [r3, #0] mov r0, r2 mov r1, r3 bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word pa .word pb

Page 28: 嵌入式處理器架構與 程式設計

28

Example #10: Dealing with structtypedef struct testStruct

{

unsigned int a;

unsigned int b;

char c;

} testStruct;

testStruct *ptest;

int main()

{

ptest->a = 4;

ptest->b = 10;

ptest->c = 'A';

} /* end main() */

main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r3, .L3 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #10 str r3, [r2, #4] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #65 strb r3, [r2, #8] ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word ptest

Page 29: 嵌入式處理器架構與 程式設計

29

Example #11: Passing Argumentsint tmp; void test(int a, int b, int c, int d, int *e); int main() { int a, b, c, d, e; a = 3; b = 4; c = 5; d = 6; e = 7; test(a, b, c, d, &e); } /* end main() */

void test(int a, int b, int c, int d, int *e) { tmp = a; a = b; b = tmp; c = b; b = d; *e = d; } /* end test() */

Page 30: 嵌入式處理器架構與 程式設計

30

Example #11: Assembly Listing1

main:

mov ip, sp

stmfd sp!, {fp, ip, lr, pc}

sub fp, ip, #4

sub sp, sp, #24

mov r3, #3

str r3, [fp, #-28]

mov r3, #4

str r3, [fp, #-24]

mov r3, #5

str r3, [fp, #-20]

mov r3, #6

str r3, [fp, #-16]

mov r3, #7

str r3, [fp, #-32]

sub r3, fp, #32

str r3, [sp, #0]

ldr r0, [fp, #-28]

ldr r1, [fp, #-24]

ldr r2, [fp, #-20]

ldr r3, [fp, #-16]

bl test

sub sp, fp, #12

ldmfd sp, {fp, sp, pc}

Page 31: 嵌入式處理器架構與 程式設計

31

Example #11: Assembly Listing2

test:

mov ip, sp

stmfd sp!, {fp, ip, lr, pc}

sub fp, ip, #4

sub sp, sp, #16

str r0, [fp, #-16]

str r1, [fp, #-20]

str r2, [fp, #-24]

str r3, [fp, #-28]

ldr r2, .L5

ldr r3, [fp, #-16]

str r3, [r2, #0]

ldr r3, [fp, #-20]

str r3, [fp, #-16]

ldr r3, .L5

ldr r3, [r3, #0]

str r3, [fp, #-20]

ldr r3, [fp, #-20]

str r3, [fp, #-24]

ldr r3, [fp, #-28]

str r3, [fp, #-20]

ldr r2, [fp, #4]

ldr r3, [fp, #-28]

str r3, [r2, #0]

sub sp, fp, #12

ldmfd sp, {fp, sp, pc}

.L6:

.align 2

.L5:

.word tmp

Page 32: 嵌入式處理器架構與 程式設計

32

Interfacing C and Assembly ARM has developed a standard called the “ARM

Procedure Call Standard” (APCS) which defines: constraints on the use of registers stack conventions format of a stack backtrace data structure argument passing and result return support for ARM shared library mechanism

Compiler generated code conforms to the APCS It's just a standard not an architectural requirement Cannot avoid standard when interfacing C and

assembly code Can avoid standard when just writing assembly code or

when writing assembly code that isn't called by C code

Page 33: 嵌入式處理器架構與 程式設計

33

Register Names and UseRegister # APCS Name APCS Role

R0 a1 argument 1

R1 a2 argument 2

R2 a3 argument 3

R3 a4 argument 4

R4..R8 v1..v5 register variables

R9 sb/v6 static base/register variable

R10 sl/v7 stack limit/register variable

R11 fp frame pointer

R12 ip scratch reg/new sb in inter link unit calls

R13 sp low end of current stack frame

R14 lr link address/scratch register

R15 pc program counter

Page 34: 嵌入式處理器架構與 程式設計

34

How Does STM Work on Memory ?

STM sp!, {r0-r15} The ARM processor

uses a bit-vector to represent each register to be saved

The architecture places the lowest number register into the lowest address

Default STM == STMDB == STMFD

pclrsp

SPbefore

address0x900x8c0x880x840x800x7c0x780x740x700x6c0x680x640x600x5c0x580x540x50

ipfpv7v6v5v4v3v2v1a4a3a2a1SPafter

Page 35: 嵌入式處理器架構與 程式設計

35

Passing and Returning Structures

Structures are usually passed in registers (and overflow onto the stack when necessary)

When a function returns a struct, a pointer to where the struct result is to be placed is passed in a1 (first argument)

Example struct s f(int x);

is compiled as

void f(struct s *result, int x);

Page 36: 嵌入式處理器架構與 程式設計

36

Example #12: Passing Structurestypedef struct two_ch_struct{

char ch1;

char ch2;

} two_ch;

two_ch max(two_ch a, two_ch b){

return((a.ch1 > b.ch1)?a:b);

} /* end max() */

max: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #12 str r0, [fp, #-24] str r1, [fp, #-16] str r2, [fp, #-20] ldrb r2, [fp, #-16] ldrb r3, [fp, #-20] cmp r2, r3 bls .L2 ldr r3, [fp, #-16] ldr r2, [fp, #-24] str r3, [r2, #0] b .L1.L2: ldr r3, [fp, #-20] ldr r2, [fp, #-24] str r3, [r2, #0].L1: ldr r0, [fp, #-24] sub sp, fp, #12 ldmfd sp, {fp, sp, pc}

Page 37: 嵌入式處理器架構與 程式設計

37

The Frame Pointer• Frame pointer (fp) points to the top

of stack for function By using the frame pointer and

storing it at the same offset for every function call, it creates a singly linked list of activation records

foo: mov ip,sp stmfd sp!,{a1-a3,fp,ip,lr,pc} sub fp,ip,#4 <computations go here> sub fp,fp,#12 ldmfd fp,{fp,sp,pc}

pc

lr

ip

fp

address0x900x8c0x880x840x800x7c0x780x740x70

fp

a3

a2

a1

ip

sp

Page 38: 嵌入式處理器架構與 程式設計

38

Backtrace

The fp register points to the stack backtrace structure for the currently executing function.

The saved fp value is (zero or) a pointer to a stack backtrace structure created by the function which called the current function.

The saved fp value in this structure is a pointer to the stack backtrace structure for the function that called the function that called the current function; and so on back until the first function.

Page 39: 嵌入式處理器架構與 程式設計

39

Creating the “Backtrace” Structure

MOV ip, sp

STMFD sp!,{a1-a4,v1-v7,fp,ip,sp,lr,pc}

SUB fp, ip, #4

sub fp, fp, #16

LDMFD fp, {fp,sp,sb,pc}

SPbefore

address0x900x8c0x880x840x800x7c0x780x740x700x6c0x680x640x600x5c0x580x540x50

SPcurrent

FPafter (saved) pc(saved) lr(saved) sp(saved) ip(saved) fp

v7v6v5v4v3v2v1a4a3a2a1

IPcurrent

Page 40: 嵌入式處理器架構與 程式設計

40

Example Backtrace

(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp

v7v6v5v4v3v2v1a4a3a2a1

(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp

v7v6v5v4v3v2v1a4a3a2a1

(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp

v7v6v5v4v3v2v1a4a3a2a1

fp

bar’s framefoo’s frame

main’s frame

Page 41: 嵌入式處理器架構與 程式設計

41

Exercise #1

Write an assembly subroutine that implements the quicksort algorithm to sort a list of unsigned integer values. The first entry in the list is the list’s length. void quickSort(unsigned int *list);

Input Output

list: 0x00000005 0x00000005

0xA356A101 0x09250037

0xE235C203 0x29567322

0x7A35B310 0x7A35B310

0x09250037 0xA356A101

0x29567322 0xE235C203

Page 42: 嵌入式處理器架構與 程式設計

42

Exercise #2 Write an assembly subroutine that deletes an item

from an ordered list of unsigned values if it is present. The first entry in the list is the list’s length. void removeItem(unsigned int item, unsigned int *list);

Input Outputitem: 0x7A35B310list: 0x00000005 0x00000004

0x09250037 0x092500370x29567322 0x295673220x7A35B310 0xA356A1010xA356A101 0xE235C2030xE235C203

Page 43: 嵌入式處理器架構與 程式設計

43

Outline

Assembly Programming Assembly-C Interface Peephole Optimization

Page 44: 嵌入式處理器架構與 程式設計

44

Peephole Optimization

Final pass over generated code: Examine a few consecutive instructions: 2 to 4

See if an obvious replacement is possible: store/load pairs

MOV %eax => mema

MOV mema => %eax Can eliminate the second instruction without needing

any global knowledge of mema Use algebraic identities Special-case individual instructions

Page 45: 嵌入式處理器架構與 程式設計

45

Algebraic Identities

Worth recognizing single instructions with a constant operand: A * 2 = A + A A * 1 = A A * 0 = 0 A / 1 = A

More delicate with floating-point

Page 46: 嵌入式處理器架構與 程式設計

46

Is this ever helpful?

Why would anyone write X * 1? Why bother to correct such obvious junk

code? In fact one might write

#define MAX_TASKS 1...a = b * MAX_TASKS;

Also, seemingly redundant code can be produced by other optimizations. This is an important effect.

Page 47: 嵌入式處理器架構與 程式設計

47

Replace Multiply by Shift

A := A * 4; Can be replaced by 2-bit left shift

(signed/unsigned) But must worry about overflow if language

does A := A / 4;

If unsigned, can replace with shift right But shift right arithmetic is a well-known

problem Language may allow it anyway (traditional C)

Page 48: 嵌入式處理器架構與 程式設計

48

Addition Chains for Multiplication

If multiply is very slow (or on a machine with no multiply instruction like the original SPARC), decomposing a constant operand into sum of powers of two can be effective: X * 125 = x * 128 – x * 4 + x Two shifts, one subtract and one add, which

may be faster than one multiply Note similarity with efficient exponentiation

method

Page 49: 嵌入式處理器架構與 程式設計

49

The Right Shift Problem

Arithmetic Right shift: Shift right and use sign bit to fill most

significant bits -5 111111...1111111011 SAR 111111...1111111101 Which is -3, not -2 In most languages -5/2 = -2 Prior to C99, implementations were allowed to

truncate towards or away from zero if either operand was negative

Page 50: 嵌入式處理器架構與 程式設計

50

Folding Jumps to Jumps

A jump to an unconditional jump can copy the target address

JNE lab1

...

lab1 JMP lab2 Can be replaced by

JNE lab2 As a result, lab1 may become dead

(unreferenced)

Page 51: 嵌入式處理器架構與 程式設計

51

Jump to Return

A jump to a return can be replaced by a return

JMP lab1...

lab1 RET Can be replaced by

RET lab1 may become dead code

Page 52: 嵌入式處理器架構與 程式設計

52

Tail Recursion Elimination1

A subprogram is tail-recursive if the last computation is a call to itself:

function last (lis : list_type) return list_type is

begin

if lis.next = null then return lis;

else return last (lis.next);

end; Recursive call can be replaced with lis := lis.next;

goto start; -- added label

Page 53: 嵌入式處理器架構與 程式設計

53

Tail Recursion Elimination2

Saves time: an assignment and jump is faster than a call with one parameter

Saves stack space: converts linear stack usage to constant usage.

In languages with no loops, this may be a required optimization: specified in Scheme standard.

Page 54: 嵌入式處理器架構與 程式設計

54

Tail Recursion Elimination3

Consider the sequence on the x86:CALL funcRET

CALL pushes return point on stack, RET in body of func removes it, RET in caller returns

Can generate instead:JMP func

Now RET in func returns to original caller, because single return address on stack

Page 55: 嵌入式處理器架構與 程式設計

55

The REALIA COBOL Compiler1

Full compiler for Standard COBOL, targeted to the IBM PC.

Now distributed by Computer Associates Runs in 150K bytes, but must be able to

handle very large programs that run on mainframes

Page 56: 嵌入式處理器架構與 程式設計

56

The REALIA COBOL Compiler2

No global optimization possible: multiple linear passes over code, no global data structures, no flow graph.

Multiple peephole optimizations, compiler iterates until code is stable. Each pass scans code backwards to minimize address recomputations

Page 57: 嵌入式處理器架構與 程式設計

57

Typical COBOL Code

Process-Balance. if Balance is negative then perform Send-Bill else perform Record-Credit end-if.Send-Bill. ...Record-Credit. ...

Page 58: 嵌入式處理器架構與 程式設計

58

Simple Assembly

Pb: cmp balance, 0

jnl L1

call Sb

jmp L2 -- jump to return

L1: call Rc

L2: ret

Sb: …

ret

Rc: …

ret

Page 59: 嵌入式處理器架構與 程式設計

59

Fold Jump to Return Statement

Pb: cmp balance, 0

jnl L1

call Sb -- tail recursion

ret -- folded

L1: call Rc -- tail recursion

L2: ret

Sb: …

ret

Rc: …

ret

Page 60: 嵌入式處理器架構與 程式設計

60

Eliminate Tail Recursion

Pb: cmp balance, 0

jnl L1 -- jump to unconditional jump

jmp Sb

ret

L1: jmp Rc -- will become useless

L2: ret

Sb: …

ret

Rc: …

ret

Page 61: 嵌入式處理器架構與 程式設計

61

Corresponding Assembly

Pb: cmp balance, 0

jnl Rc -- folded

jmp Sb

ret -- unreachable

L1: jmp Rc -- unreachable

L2: ret -- unreachable

Sb: …

ret

Rc: …

ret

Page 62: 嵌入式處理器架構與 程式設計

62

Remove Dead Code

Pb: cmp balance, 0

jnl Rc

jmp Sb -- jump to next instruction

Sb: …

ret

Rc: …

ret

Page 63: 嵌入式處理器架構與 程式設計

63

Final Code

Pb: cmp balance, 0

jnl Rc

Sb: …

ret

Rc: …

ret Final code as efficient as inlining. All transformations are local. Each optimization

may yield further optimization opportunities. Iterate till no further change.

Page 64: 嵌入式處理器架構與 程式設計

64

Arcane Tricks

Consider typical maximum computationif A >= B then

C := A;

else

C := B;

end if; For simplicity assume all unsigned, and all in

registers

Page 65: 嵌入式處理器架構與 程式設計

65

Eliminating Max Jump on x86

Simple-minded assembly codeCMP A, B

JNAE L1

MOV A=>C

JMP L2

L1: MOV B=>C

L2: One jump in either case

Page 66: 嵌入式處理器架構與 程式設計

66

Computing Max without Jumps Architecture-specific trick: use subtract with borrow

instruction and carry flag CMP A, B ; CF=1 if B > A, CF = 0 if A >= B

SBB %eax, %eax ; all 1's if B > A, all 0's if A >= B

MOV %eax, C

NOT C ; all 0's if B > A, all 1's if A >= B

AND B=>%eax ; B if B>A, 0 if A>=B

AND A=>C ; 0 if B >A, A if A>=B

OR %eax=>C ; B if B>A, A if A>=B

More instructions, but NO JUMPS Supercompiler: exhaustive search of instruction patterns to

uncover similar tricks