嵌入式處理器架構與程式設計

嵌入式處理器架構與程式設計

王建民中央研究院資訊所

2008 年 7 月

2

Contents Introduction Computer Architecture ARM Architecture Development Tools GNU Development Tools ARM Instruction Set ARM Assembly Language ARM Assembly Programming GNU ARM ToolChain Interrupts and Monitor

Lecture 8ARM Assembly Programming

4

Outline

Assembly Programming Assembly-C Interface Peephole Optimization

5

Example #4: String Length#include <stdio.h>extern int mystrlen(char *s);int main(){ char s[20] = “Hello, World!\n”; printf("The length of the string is %d\n", mystrlen(s));}

int mystrlen(char *s1){ char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2-s1);}

6

Example #4: Pseudo Codeint mystrlen(char *s1)

{

char *s2;

s2 = s1;

while (*s2 != 0) {

s2++;

}

return (s2-s1);

}

mystrlen:

s2 = s1

start_loop:

if (*s2 == 0) goto end_loop

s2 = s2 + 1

goto start_loop

end_loop:

return (s2-s1)

7

Example #4: Storage Assignmentmystrlen:

s2 = s1

start_loop:

if (*s2 == 0) goto end_loop

s2 = s2 + 1

goto start_loop

end_loop:

return (s2-s1)

mystrlen:

r4 = r0

start_loop:

r5 = *r4

if (r5 == 0) goto end_loop

s4 = r4 + 1

goto start_loop

end_loop:

return (r4-r0)

8

Example #4: Final Assembly Code

mystrlen:

r4 = r0

start_loop:

r5 = *r4

if (r5 == 0) goto end_loop

r4 = r4 + 1

goto start_loop

end_loop:

return (r4-r0)

.text

.align 2

.global mystrlen

mystrlen:

mov r4, r0

start_loop:

ldrb r5, [r4]

cmp r5, #0

beq end_loop

add r4, r4, #1

b start_loop

end_loop:

sub r0, r4, r0

mov pc, lr

9

Example #5: Summation#include <stdio.h>extern int mysum(int n, int *array);int main(){ int a[5] = {1, 3, 5, 7, 9}; printf("The summation of the array is %d\n", mysum(5,a));}

int mysum(int n, int *array){ int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum;}

10

Example #5: Pseudo Codeint mysum(int n, int *array)

{

int i, sum;

sum = 0;

for (i = 0; i < n; i++) {

sum += array[i];

}

return sum;

}

mysum:

sum = 0

i = 0

start_loop:

if (i >= n) goto end_loop

sum = sum + array[i]

i = i + 1

goto start_loop

end_loop:

return sum

11

Example #5: Storage Assignmentmysum:

sum = 0

i = 0

start_loop:

if (i >= n) goto end_loop

sum = sum + array[i]

i = i + 1

goto start_loop

end_loop:

return sum

mysum:

r5 = 0

r4 = 0

start_loop:

if (r4 >= r0) goto end_loop

r6 = r1[r4]

r5 = r5 + r6

r4 = r4 + 1

goto start_loop

end_loop:

return r5

12

Example #5: Final Assembly Code

mysum: r5 = 0 r4 = 0start_loop: if (r4 >= r0) goto end_loop

r6 = r1[r4] r5 = r5 + r6 r4 = r4 + 1 goto start_loopend_loop: return r5

.text .align 2 .global mysummysum: mov r5, #0 mov r4, #0start_loop: cmp r4, r0 bge end_loop ldr r6, [r1,r4,LSL#2] add r5, r5, r6 add r4, r4, #1 b start_loopend_loop: mov r0, r5 mov pc, lr

13

Example #6: Bubble Sort1

#include <stdio.h>

extern void bubble(int n, int *a);

int main()

{

int i;

int a[5] = {9, 7, 5, 3, 1};

bubble(5, a);

printf("The sorted array:\n");

for (i = 0; i < 5; i++) {

printf("a[%d] = %d\n", i, a[i]);

}

}

14

Example #6: Bubble Sort2

void sort2(int *a, int *b){ int tmp; if (*b < *a) { tmp = *a; *a = *b; *b = tmp; }}

void bubble(int n, int *a){ int i, j; for (i = 0; i < n-1; i++) { for (j = 0; j < n-1-i; j++) { sort2(&a[j], &a[j+1]); } }}

15

Example #6: Pseudo Codevoid bubble(int n, int *a);{ int i, j; for (i = 0; i < n-1; i++) {

for (j = 0; j < n-1-i; j++) {

sort2(&a[j], &a[j+1]); }

}

}

bubble:

i = 0start_outer: if (i >= n-1) goto end_outer j = 0start_inner: if (j >= n-1-i) goto end_inner sort2(&a[j],&a[j+1]) j = j + 1 goto start_innerend_inner: i = i + 1 goto start_outerend_outer: return

16

Example #6: Storage Assignmentbubble: i = 0start_outer: if (i >= n-1) goto end_outer

j = 0start_inner: if (j >= n-1-i) goto end_inner

sort2(&a[j],&a[j+1]) j = j + 1 goto start_innerend_inner: i = i + 1 goto start_outerend_outer: return

bubble: r2 = 0start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer r3 = 0start_inner: r5 = r4 – r2 if (r3 >= r5) goto end_inner sort2(r1+r3*4,r1+r3*4+4) r3 = r3 + 1 goto start_innerend_inner: r2 = r2 + 1 goto start_outerend_outer: return

17

Example #6: Assembly Code?bubble: r2 = 0start_outer: r4 = r0 - 1 if (r2 >= r4) goto end_outer

r3 = 0start_inner: r5 = r4 – r2 if (r3 >= r5) goto end_inner

sort2(r1+r3*4,r1+r3*4+4)

r3 = r3 + 1 goto start_innerend_inner: r2 = r2 + 1 goto start_outerend_outer: return

bubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr

18

Example #6: Final Assembly Codebubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner

add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2

add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr

bubble: mov r2, #0start_outer: sub r4, r0, #1 cmp r2, r4 bge end_outer mov r3, #0start_inner: sub r5, r4, r2 cmp r3, r5 bge end_inner stmfd sp!,{r0-r3,lr} add r0, r1, r3, LSL #2 add r1, r0, #4 bl sort2 ldmfd sp, {r0-r3,lr} add r3, r3, #1 b start_innerend_inner: add r2, r2, #1 b start_outerend_outer: mov pc, lr

19

Outline


20

Generating Assembly Code from C

In this course, we will be using the GNU ARM ToolChain.

To compile a C program to assembly code arm-elf-gcc –S filename.c When you compile a .c file, you get a .s file This .s file contains the assembly language code

When assembled, this code can potentially be linked and loaded as an executable

To display information from an object file arm-elf-objdump –S –r filename

21

Example #7: A Simple Programint a, b;

int main()

{

a = 3;

b = 4;

} /* end main() */

.file "example4.c" .text .align 2 .global main .type main, %functionmain: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r2, .L3 mov r3, #3 str r3, [r2, #0] ldr r2, .L3+4 mov r3, #4 str r3, [r2, #0] ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word a .word b .size main, .-main .comm a,4,4 .comm b,4,4 .ident "GCC: (GNU) 4.0.0"Declare storage for a and b

Loader will put addresses of a and b in this memory location

22

Example #7: Object Fileexample1.o: file format elf32-littlearm

Disassembly of section .text:

00000000 <main>: 0: e1a0c00d mov ip, sp 4: e92dd800 stmdb sp!, {fp, ip, lr, pc} 8: e24cb004 sub fp, ip, #4 ; 0x4 c: e59f2014 ldr r2, [pc, #20] ; 28 <.text+0x28> 10: e3a03003 mov r3, #3 ; 0x3 14: e5823000 str r3, [r2] 18: e59f200c ldr r2, [pc, #12] ; 2c <.text+0x2c> 1c: e3a03004 mov r3, #4 ; 0x4 20: e5823000 str r3, [r2] 24: e89da800 ldmia sp, {fp, sp, pc}

...28: R_ARM_ABS32 a2c: R_ARM_ABS32 b

23

Example #7: Executable File00008208 <main>:

8208: e1a0c00d mov ip, sp

820c: e92dd800 stmdb sp!, {fp, ip, lr, pc}

8210: e24cb004 sub fp, ip, #4 ; 0x4

8214: e59f2014 ldr r2, [pc, #20] ; 8230 <.text+0x210>

8218: e3a03003 mov r3, #3 ; 0x3

821c: e5823000 str r3, [r2]

8220: e59f200c ldr r2, [pc, #12] ; 8234 <.text+0x214>

8224: e3a03004 mov r3, #4 ; 0x4

8228: e5823000 str r3, [r2]

822c: e89da800 ldmia sp, {fp, sp, pc}

8230: 0000adc4 andeq sl, r0, r4, asr #27

8234: 0000adc0 andeq sl, r0, r0, asr #27

24

Example #8: Calling A Functionint tmp; void swap(int a, int b);int main() {

int a, b; a = 3; b = 4; swap(a, b);

} /* end main() */

void swap(int a, int b) {

tmp = a; a = b; b = tmp;

} /* end swap() */

25

Example #8: Assembly Listingmain: mov ip, sp stmfd sp!, {fp, ip, lr,

pc} sub fp, ip, #4 sub sp, sp, #8 mov r3, #3 str r3, [fp, #-20] mov r3, #4 str r3, [fp, #-16] ldr r0, [fp, #-20] ldr r1, [fp, #-16] bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc}

swap: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #8 str r0, [fp, #-16] str r1, [fp, #-20] ldr r2, .L5 ldr r3, [fp, #-16] str r3, [r2, #0] ldr r3, [fp, #-20] str r3, [fp, #-16] ldr r3, .L5 ldr r3, [r3, #0] str r3, [fp, #-20] sub sp, fp, #12 ldmfd sp, {fp, sp, pc}.L6: .align 2.L5: .word tmp .comm tmp,4,4

26

Example #9: Manipulating Pointersint tmp; int *pa, *pb; void swap(int a, int b); int main() { int a, b; pa = &a; pb = &b; *pa = 3; *pb = 4; swap(*pa, *pb); } /* end main() */

void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */

27

Example #9: Assembly Listingmain: mov ip, sp stmfd sp!, {fp, ip, lr,

pc} sub fp, ip, #4 sub sp, sp, #8 ldr r2, .L3 sub r3, fp, #16 str r3, [r2, #0] ldr r2, .L3+4 sub r3, fp, #20 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #3 str r3, [r2, #0] ldr r3, .L3+4 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0]

ldr r3, .L3 ldr r3, [r3, #0] ldr r2, [r3, #0] ldr r3, .L3+4 ldr r3, [r3, #0] ldr r3, [r3, #0] mov r0, r2 mov r1, r3 bl swap sub sp, fp, #12 ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word pa .word pb

28

Example #10: Dealing with structtypedef struct testStruct

{

unsigned int a;

unsigned int b;

char c;

} testStruct;

testStruct *ptest;

int main()

{

ptest >a = 4;

ptest >b = 10;

ptest >c = 'A';

} /* end main() */

main: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 ldr r3, .L3 ldr r2, [r3, #0] mov r3, #4 str r3, [r2, #0] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #10 str r3, [r2, #4] ldr r3, .L3 ldr r2, [r3, #0] mov r3, #65 strb r3, [r2, #8] ldmfd sp, {fp, sp, pc}.L4: .align 2.L3: .word ptest

29

Example #11: Passing Argumentsint tmp; void test(int a, int b, int c, int d, int *e); int main() { int a, b, c, d, e; a = 3; b = 4; c = 5; d = 6; e = 7; test(a, b, c, d, &e); } /* end main() */

void test(int a, int b, int c, int d, int *e) { tmp = a; a = b; b = tmp; c = b; b = d; *e = d; } /* end test() */

30

Example #11: Assembly Listing1

main:

mov ip, sp

stmfd sp!, {fp, ip, lr, pc}

sub fp, ip, #4

sub sp, sp, #24

mov r3, #3

str r3, [fp, #-28]

mov r3, #4

str r3, [fp, #-24]

mov r3, #5

str r3, [fp, #-20]

mov r3, #6

str r3, [fp, #-16]

mov r3, #7

str r3, [fp, #-32]

sub r3, fp, #32

str r3, [sp, #0]

ldr r0, [fp, #-28]

ldr r1, [fp, #-24]

ldr r2, [fp, #-20]

ldr r3, [fp, #-16]

bl test

sub sp, fp, #12

ldmfd sp, {fp, sp, pc}

31

Example #11: Assembly Listing2

test:

mov ip, sp

stmfd sp!, {fp, ip, lr, pc}

sub fp, ip, #4

sub sp, sp, #16

str r0, [fp, #-16]

str r1, [fp, #-20]

str r2, [fp, #-24]

str r3, [fp, #-28]

ldr r2, .L5

ldr r3, [fp, #-16]

str r3, [r2, #0]

ldr r3, [fp, #-20]

str r3, [fp, #-16]

ldr r3, .L5

ldr r3, [r3, #0]

str r3, [fp, #-20]

ldr r3, [fp, #-20]

str r3, [fp, #-24]

ldr r3, [fp, #-28]

str r3, [fp, #-20]

ldr r2, [fp, #4]

ldr r3, [fp, #-28]

str r3, [r2, #0]

sub sp, fp, #12

ldmfd sp, {fp, sp, pc}

.L6:

.align 2

.L5:

.word tmp

32

Interfacing C and Assembly ARM has developed a standard called the “ARM

Procedure Call Standard” (APCS) which defines: constraints on the use of registers stack conventions format of a stack backtrace data structure argument passing and result return support for ARM shared library mechanism

Compiler generated code conforms to the APCS It's just a standard not an architectural requirement Cannot avoid standard when interfacing C and

assembly code Can avoid standard when just writing assembly code or

when writing assembly code that isn't called by C code

33

Register Names and UseRegister # APCS Name APCS Role

R0 a1 argument 1

R1 a2 argument 2

R2 a3 argument 3

R3 a4 argument 4

R4..R8 v1..v5 register variables

R9 sb/v6 static base/register variable

R10 sl/v7 stack limit/register variable

R11 fp frame pointer

R12 ip scratch reg/new sb in inter link unit calls

R13 sp low end of current stack frame

R14 lr link address/scratch register

R15 pc program counter

34

How Does STM Work on Memory ?

STM sp!, {r0 r15} The ARM processor

uses a bit-vector to represent each register to be saved

The architecture places the lowest number register into the lowest address

Default STM == STMDB == STMFD

pclrsp

SPbefore

address0x900x8c0x880x840x800x7c0x780x740x700x6c0x680x640x600x5c0x580x540x50

ipfpv7v6v5v4v3v2v1a4a3a2a1SPafter

35

Passing and Returning Structures

Structures are usually passed in registers (and overflow onto the stack when necessary)

When a function returns a struct, a pointer to where the struct result is to be placed is passed in a1 (first argument)

Example struct s f(int x);

is compiled as

void f(struct s *result, int x);

36

Example #12: Passing Structurestypedef struct two_ch_struct{

char ch1;

char ch2;

} two_ch;

two_ch max(two_ch a, two_ch b){

return((a.ch1 > b.ch1)?a:b);

} /* end max() */

max: mov ip, sp stmfd sp!, {fp, ip, lr, pc} sub fp, ip, #4 sub sp, sp, #12 str r0, [fp, #-24] str r1, [fp, #-16] str r2, [fp, #-20] ldrb r2, [fp, #-16] ldrb r3, [fp, #-20] cmp r2, r3 bls .L2 ldr r3, [fp, #-16] ldr r2, [fp, #-24] str r3, [r2, #0] b .L1.L2: ldr r3, [fp, #-20] ldr r2, [fp, #-24] str r3, [r2, #0].L1: ldr r0, [fp, #-24] sub sp, fp, #12 ldmfd sp, {fp, sp, pc}

37

The Frame Pointer• Frame pointer (fp) points to the top

of stack for function By using the frame pointer and

storing it at the same offset for every function call, it creates a singly linked list of activation records

foo: mov ip,sp stmfd sp!,{a1 a3,fp,ip,lr,pc} sub fp,ip,#4 <computations go here> sub fp,fp,#12 ldmfd fp,{fp,sp,pc}

pc

lr

ip

fp

address0x900x8c0x880x840x800x7c0x780x740x70

fp

a3

a2

a1

ip

sp

38

Backtrace

The fp register points to the stack backtrace structure for the currently executing function.

The saved fp value is (zero or) a pointer to a stack backtrace structure created by the function which called the current function.

The saved fp value in this structure is a pointer to the stack backtrace structure for the function that called the function that called the current function; and so on back until the first function.

39

Creating the “Backtrace” Structure

MOV ip, sp

STMFD sp!,{a1 a4,v1 v7,fp,ip,sp,lr,pc}

SUB fp, ip, #4

…

…

sub fp, fp, #16

LDMFD fp, {fp,sp,sb,pc}

SPbefore

address0x900x8c0x880x840x800x7c0x780x740x700x6c0x680x640x600x5c0x580x540x50

SPcurrent

FPafter (saved) pc(saved) lr(saved) sp(saved) ip(saved) fp

v7v6v5v4v3v2v1a4a3a2a1

IPcurrent

40

Example Backtrace

(saved) pc(saved) lr(saved) sp(saved) ip(saved) fp






fp

bar’s framefoo’s frame

main’s frame

41

Exercise #1

Write an assembly subroutine that implements the quicksort algorithm to sort a list of unsigned integer values. The first entry in the list is the list’s length. void quickSort(unsigned int *list);

Input Output

list: 0x00000005 0x00000005

0xA356A101 0x09250037

0xE235C203 0x29567322

0x7A35B310 0x7A35B310

0x09250037 0xA356A101

0x29567322 0xE235C203

42

Exercise #2 Write an assembly subroutine that deletes an item

from an ordered list of unsigned values if it is not already there. The first entry in the list is the list’s length. void removeItem(unsigned int item, unsigned int *list);

Input Outputitem: 0x7A35B310list: 0x00000005 0x00000004

0x09250037 0x092500370x29567322 0x295673220x7A35B310 0xA356A1010xA356A101 0xE235C2030xE235C203

43

Outline


44

Peephole Optimization

Final pass over generated code: Examine a few consecutive instructions: 2 to 4

See if an obvious replacement is possible: store/load pairs

MOV %eax => mema

MOV mema => %eax Can eliminate the second instruction without needing

any global knowledge of mema Use algebraic identities Special-case individual instructions

45

Algebraic Identities

Worth recognizing single instructions with a constant operand: A * 2 = A + A A * 1 = A A * 0 = 0 A / 1 = A

More delicate with floating-point

46

Is this ever helpful?

Why would anyone write X * 1? Why bother to correct such obvious junk

code? In fact one might write

#define MAX_TASKS 1...a = b * MAX_TASKS;

Also, seemingly redundant code can be produced by other optimizations. This is an important effect.

47

Replace Multiply by Shift

A := A * 4; Can be replaced by 2-bit left shift

(signed/unsigned) But must worry about overflow if language

does A := A / 4;

If unsigned, can replace with shift right But shift right arithmetic is a well-known

problem Language may allow it anyway (traditional C)

48

Addition Chains for Multiplication

If multiply is very slow (or on a machine with no multiply instruction like the original SPARC), decomposing a constant operand into sum of powers of two can be effective: X * 125 = x * 128 – x * 4 + x Two shifts, one subtract and one add, which

may be faster than one multiply Note similarity with efficient exponentiation

method

49

The Right Shift Problem

Arithmetic Right shift: Shift right and use sign bit to fill most

significant bits -5 111111...1111111011 SAR 111111...1111111101 Which is -3, not -2 In most languages -5/2 = -2 Prior to C99, implementations were allowed to

truncate towards or away from zero if either operand was negative

50

Folding Jumps to Jumps

A jump to an unconditional jump can copy the target address

JNE lab1

...

lab1 JMP lab2 Can be replaced by

JNE lab2 As a result, lab1 may become dead

(unreferenced)

51

Jump to Return

A jump to a return can be replaced by a return

JMP lab1...

lab1 RET Can be replaced by

RET lab1 may become dead code

52

Tail Recursion Elimination1

A subprogram is tail-recursive if the last computation is a call to itself:

function last (lis : list_type) return lis_type is

begin

if lis.next = null then return lis;

else return last (lis.next);

end; Recursive call can be replaced with lis := lis.next;

goto start; -- added label

53


Saves time: an assignment and jump is faster than a call with one parameter

Saves stack space: converts linear stack usage to constant usage.

In languages with no loops, this may be a required optimization: specified in Scheme standard.

54


Consider the sequence on the x86:CALL funcRET

CALL pushes return point on stack, RET in body of func removes it, RET in caller returns

Can generate instead:JMP func

Now RET in func returns to original caller, because single return address on stack

55

The REALIA COBOL Compiler1

Full compiler for Standard COBOL, targeted to the IBM PC.

Now distributed by Computer Associates Runs in 150K bytes, but must be able to

handle very large programs that run on mainframes

56

The REALIA COBOL Compiler2

No global optimization possible: multiple linear passes over code, no global data structures, no flow graph.

Multiple peephole optimizations, compiler iterates until code is stable. Each pass scan code backwards to minimize address recomputations

57

Typical COBOL Code

Process-Balance. if Balance is negative then perform Send-Bill else perform Record-Credit end-if.Send-Bill. ...Record-Credit. ...

58

Simple Assembly

Pb: cmp balance, 0

jnl L1

call Sb

jmp L2 -- jump to return

L1: call Rc

L2: ret

Sb: …

ret

Rc: …

ret

59

Fold Jump to Return Statement

Pb: cmp balance, 0

jnl L1

call Sb -- tail recursion

ret -- folded

L1: call Rc -- tail recursion

L2: ret

Sb: …

ret

Rc: …

ret

60

Eliminate Tail Recursion

Pb: cmp balance, 0

jnl L1 -- jump to unconditional jump

imp Sb

ret

L1: jmp Rc -- will become useless

L2: ret

Sb: …

ret

Rc: …

ret

61

Corresponding Assembly

Pb: cmp balance, 0

jnl Rc -- folded

jmp Sb

ret -- unreachable

L1: jmp Rc -- unreachable

L2: ret -- unreachable

Sb: …

ret

Rc: …

ret

62

Remove Dead Code

Pb: cmp balance, 0

jnl Rc

jmp Sb -- jump to next instruction

Sb: …

ret

Rc: …

ret

63

Final Code

Pb: cmp balance, 0

jnl Rc

Sb: …

ret

Rc: …

ret Final code as efficient as inlining. All transformations are local. Each optimization

may yield further optimization opportunities. Iterate till no further change.

64

Arcane Tricks

Consider typical maximum computationif A >= B then

C := A;

else

C := B;

end if; For simplicity assume all unsigned, and all in

registers

65

Eliminating Max Jump on x86

Simple-minded assembly codeCMP A, B

JNAE L1

MOVA=>C

JMP L2

L1: MOVB=>C

L2: One jump in either case

66

Computing Max without Jumps Architecture-specific trick: use subtract with borrow

instruction and carry flag CMP A, B ; CF=1 if B > A, CF = 0 if A >= B

SBB %eax, %eax ; all 1's if B > A, all 0's if A >= BMOV %eax, CNOT C ; all 0's if B > A, all 1's if A >= BAND B=>%eax ; B if B>A, 0 if A>=BAND A=>C ; 0 if B >A, A if A>=BOR %eax=>C ; B if B>A, A if A>=B

More instructions, but NO JUMPS Supercompiler: exhaustive search of instruction patterns to

uncover similar tricks

Documents

嵌入式處理器架構與 程式設計

嵌入式處理器架構與程式設計