Create your own threads
GCC lets you issue SSE instructions manually through builtin functions.
Compile this with: g++ main.cpp
To get the assembly code, use: g++ -S main.cpp
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <string.h>

// GCC generic vector types: the name encodes lane count and lane type
// (v8qi = 8 x char in 8 bytes, v4si = 4 x int in 16 bytes, v4sf = 4 x float
// in 16 bytes). v2si is declared but not used by the functions below.
typedef int v2si __attribute__((vector_size(8)));
typedef char v8qi __attribute__((vector_size(8)));
typedef int v4si __attribute__((vector_size(16)));
typedef float v4sf __attribute__((vector_size(16)));

// Prints the first `count` lanes of `vec`, one per line, as "<index>:<value>".
//
// Lane is the element type of the vector type Vec. The vector is copied into
// a plain array with memcpy so the lanes are read without casting a pointer
// to the vector object. `count` must not exceed sizeof(Vec) / sizeof(Lane).
template <typename Lane, typename Vec>
static void print_lanes(const Vec &vec, int count)
{
    Lane lanes[sizeof(Vec) / sizeof(Lane)];
    memcpy(lanes, &vec, sizeof(Vec));
    for (int i = 0; i < count; i++) {
        // Unary plus promotes a char lane to int so it is printed as a number
        // rather than as a character; a float lane stays a float.
        std::cout << i << ":" << +lanes[i] << std::endl;
    }
}

// Adds byte vectors with the paddb builtin (c = b + a + a) and prints the
// first four lanes of the result.
void explicit_mmx()
{
    std::cout << "*********** MMX addition ***********" << std::endl;

    // All eight lanes are initialised so the additions never read
    // indeterminate bytes; only lanes 0..3 are printed.
    const v8qi a = {0, 1, 2, 3, 0, 0, 0, 0};
    const v8qi b = {0, 1, 2, 3, 0, 0, 0, 0};

    v8qi c = __builtin_ia32_paddb(b, a);
    c = __builtin_ia32_paddb(c, a);

    // Leave MMX state before calling into the library or returning: the MMX
    // registers are shared with the x87 FPU registers.
    __builtin_ia32_emms();

    print_lanes<char>(c, 4);
}

// Same computation as explicit_mmx(), written with the generic vector `+`
// operator so that the compiler chooses the instructions.
void implicit_mmx()
{
    std::cout << "*********** MMX addition ***********" << std::endl;

    const v8qi a = {0, 1, 2, 3, 0, 0, 0, 0};
    const v8qi b = {0, 1, 2, 3, 0, 0, 0, 0};

    v8qi c = b + a;
    c = c + a;

    print_lanes<char>(c, 4);
}

// Adds {v, v, v, v} to {1, 2, 3, 4} with the generic vector `+` operator and
// prints the four resulting int lanes.
void implicit_sse(int v)
{
    std::cout << "*********** implicit SSE addition ***********" << std::endl;

    const v4si a = {v, v, v, v};
    const v4si b = {1, 2, 3, 4};
    const v4si c = a + b;
    // c = c + b;

    print_lanes<int>(c, 4);
}

// Adds {1v, 2v, 3v, 4v} to {1, 2, 2, 3} with the addps builtin and prints the
// four resulting float lanes.
void explicit_sse(float v)
{
    std::cout << "*********** explicit SSE addition ***********" << std::endl;

    const v4sf U = {1 * v, 2 * v, 3 * v, 4 * v};
    const v4sf V = {1, 2, 2, 3};
    const v4sf W = __builtin_ia32_addps(U, V);

    print_lanes<float>(W, 4);
}

// Runs the four demonstrations in order, then waits for a character on stdin
// before exiting.
int main()
{
    explicit_mmx();
    implicit_mmx();

    // Re-allow floating point operations (the MMX and FPU registers are
    // shared). explicit_mmx() already does this for its own builtins; the
    // call is kept here in case the compiler selects MMX instructions for
    // the generic vector code in implicit_mmx().
    __builtin_ia32_emms();

    implicit_sse(2);
    explicit_sse(3.141592f);

    getchar();
    return 0;
}
.file "main.cpp" .local _ZStL8__ioinit .comm _ZStL8__ioinit,1,1 .section .rodata .align 8 .LC0: .string "*********** MMX addition ***********" .LC1: .string ":" .text .globl _Z12explicit_mmxv .type _Z12explicit_mmxv, @function _Z12explicit_mmxv: .LFB966: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 pushq %rbx subq $88, %rsp movl $.LC0, %esi movl $_ZSt4cout, %edi .cfi_offset 3, -24 call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E leaq -56(%rbp), %rax movq %rax, -32(%rbp) movq -32(%rbp), %rax movb $0, (%rax) movq -32(%rbp), %rax addq $1, %rax movb $1, (%rax) movq -32(%rbp), %rax addq $2, %rax movb $2, (%rax) movq -32(%rbp), %rax addq $3, %rax movb $3, (%rax) leaq -64(%rbp), %rax movq %rax, -40(%rbp) movq -40(%rbp), %rax movb $0, (%rax) movq -40(%rbp), %rax addq $1, %rax movb $1, (%rax) movq -40(%rbp), %rax addq $2, %rax movb $2, (%rax) movq -40(%rbp), %rax addq $3, %rax movb $3, (%rax) leaq -72(%rbp), %rax movq %rax, -48(%rbp) movq -48(%rbp), %rax movb $0, (%rax) movq -48(%rbp), %rax addq $1, %rax movb $0, (%rax) movq -48(%rbp), %rax addq $2, %rax movb $0, (%rax) movq -48(%rbp), %rax addq $3, %rax movb $0, (%rax) movq -56(%rbp), %mm0 movq -64(%rbp), %mm1 paddb %mm1, %mm0 movq %mm0, -88(%rbp) movq -88(%rbp), %rax movq %rax, -72(%rbp) movq -56(%rbp), %mm0 movq -72(%rbp), %mm1 paddb %mm1, %mm0 movq %mm0, -88(%rbp) movq -88(%rbp), %rax movq %rax, -72(%rbp) movl $0, -20(%rbp) jmp .L2 .L3: movl -20(%rbp), %eax cltq addq -48(%rbp), %rax movzbl (%rax), %eax movsbl %al, %ebx movl -20(%rbp), %eax movl %eax, %esi movl $_ZSt4cout, %edi call _ZNSolsEi movl $.LC1, %esi movq %rax, %rdi call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl %ebx, %esi movq %rax, %rdi call _ZNSolsEi movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E 
addl $1, -20(%rbp) .L2: cmpl $3, -20(%rbp) setle %al testb %al, %al jne .L3 addq $88, %rsp popq %rbx popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE966: .size _Z12explicit_mmxv, .-_Z12explicit_mmxv .globl _Z12implicit_mmxv .type _Z12implicit_mmxv, @function _Z12implicit_mmxv: .LFB967: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 pushq %rbx subq $72, %rsp movl $.LC0, %esi movl $_ZSt4cout, %edi .cfi_offset 3, -24 call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E leaq -56(%rbp), %rax movq %rax, -32(%rbp) movq -32(%rbp), %rax movb $0, (%rax) movq -32(%rbp), %rax addq $1, %rax movb $1, (%rax) movq -32(%rbp), %rax addq $2, %rax movb $2, (%rax) movq -32(%rbp), %rax addq $3, %rax movb $3, (%rax) leaq -64(%rbp), %rax movq %rax, -40(%rbp) movq -40(%rbp), %rax movb $0, (%rax) movq -40(%rbp), %rax addq $1, %rax movb $1, (%rax) movq -40(%rbp), %rax addq $2, %rax movb $2, (%rax) movq -40(%rbp), %rax addq $3, %rax movb $3, (%rax) leaq -72(%rbp), %rax movq %rax, -48(%rbp) movq -48(%rbp), %rax movb $0, (%rax) movq -48(%rbp), %rax addq $1, %rax movb $0, (%rax) movq -48(%rbp), %rax addq $2, %rax movb $0, (%rax) movq -48(%rbp), %rax addq $3, %rax movb $0, (%rax) movq -64(%rbp), %rax movq -56(%rbp), %rdx movq %rax, %rdi xorq %rdx, %rdi movabsq $9187201950435737471, %rcx movq %rdx, %rsi andq %rcx, %rsi movabsq $9187201950435737471, %rdx movq %rax, %rcx andq %rdx, %rcx movabsq $-9187201950435737472, %rax movq %rdi, %rdx andq %rax, %rdx leaq (%rsi,%rcx), %rax xorq %rdx, %rax movq %rax, -72(%rbp) movq -72(%rbp), %rax movq -56(%rbp), %rdx movq %rax, %rdi xorq %rdx, %rdi movabsq $9187201950435737471, %rcx movq %rdx, %rsi andq %rcx, %rsi movabsq $9187201950435737471, %rdx movq %rax, %rcx andq %rdx, %rcx movabsq $-9187201950435737472, %rax movq %rdi, %rdx andq %rax, %rdx leaq (%rsi,%rcx), %rax xorq %rdx, %rax 
movq %rax, -72(%rbp) movl $0, -20(%rbp) jmp .L5 .L6: movl -20(%rbp), %eax cltq addq -48(%rbp), %rax movzbl (%rax), %eax movsbl %al, %ebx movl -20(%rbp), %eax movl %eax, %esi movl $_ZSt4cout, %edi call _ZNSolsEi movl $.LC1, %esi movq %rax, %rdi call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl %ebx, %esi movq %rax, %rdi call _ZNSolsEi movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E addl $1, -20(%rbp) .L5: cmpl $3, -20(%rbp) setle %al testb %al, %al jne .L6 addq $72, %rsp popq %rbx popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE967: .size _Z12implicit_mmxv, .-_Z12implicit_mmxv .section .rodata .align 8 .LC2: .string "*********** implicit SSE addition ***********" .text .globl _Z12implicit_ssei .type _Z12implicit_ssei, @function _Z12implicit_ssei: .LFB968: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 pushq %rbx subq $104, %rsp movl %edi, -100(%rbp) movl $.LC2, %esi movl $_ZSt4cout, %edi .cfi_offset 3, -24 call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E pxor %xmm0, %xmm0 movdqa %xmm0, -48(%rbp) movl -100(%rbp), %eax movl %eax, -104(%rbp) movd -104(%rbp), %xmm1 pshufd $0, %xmm1, %xmm0 movdqa %xmm0, -48(%rbp) movdqa .LC3(%rip), %xmm0 movdqa %xmm0, -64(%rbp) movdqa -64(%rbp), %xmm0 movdqa -48(%rbp), %xmm1 paddd %xmm1, %xmm0 movdqa %xmm0, -96(%rbp) leaq -96(%rbp), %rax movq %rax, -72(%rbp) movl $0, -20(%rbp) jmp .L8 .L9: movl -20(%rbp), %eax cltq salq $2, %rax addq -72(%rbp), %rax movl (%rax), %ebx movl -20(%rbp), %eax movl %eax, %esi movl $_ZSt4cout, %edi call _ZNSolsEi movl $.LC1, %esi movq %rax, %rdi call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl %ebx, %esi movq %rax, %rdi call _ZNSolsEi movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call 
_ZNSolsEPFRSoS_E addl $1, -20(%rbp) .L8: cmpl $3, -20(%rbp) setle %al testb %al, %al jne .L9 addq $104, %rsp popq %rbx popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE968: .size _Z12implicit_ssei, .-_Z12implicit_ssei .section .rodata .align 8 .LC4: .string "*********** explicit SSE addition ***********" .text .globl _Z12explicit_ssef .type _Z12explicit_ssef, @function _Z12explicit_ssef: .LFB969: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 subq $96, %rsp movss %xmm0, -84(%rbp) movl $.LC4, %esi movl $_ZSt4cout, %edi call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E leaq -48(%rbp), %rax movq %rax, -16(%rbp) movq -16(%rbp), %rax movl -84(%rbp), %edx movl %edx, (%rax) movq -16(%rbp), %rax addq $4, %rax movss -84(%rbp), %xmm0 addss %xmm0, %xmm0 movss %xmm0, (%rax) movq -16(%rbp), %rax addq $8, %rax movss -84(%rbp), %xmm1 movss .LC5(%rip), %xmm0 mulss %xmm1, %xmm0 movss %xmm0, (%rax) movq -16(%rbp), %rax addq $12, %rax movss -84(%rbp), %xmm1 movss .LC6(%rip), %xmm0 mulss %xmm1, %xmm0 movss %xmm0, (%rax) leaq -64(%rbp), %rax movq %rax, -24(%rbp) movq -24(%rbp), %rax movl $0x3f800000, %edx movl %edx, (%rax) movq -24(%rbp), %rax leaq 4(%rax), %rdx movl $0x40000000, %eax movl %eax, (%rdx) movq -24(%rbp), %rax leaq 8(%rax), %rdx movl $0x40000000, %eax movl %eax, (%rdx) movq -24(%rbp), %rax leaq 12(%rax), %rdx movl $0x40400000, %eax movl %eax, (%rdx) leaq -80(%rbp), %rax movq %rax, -32(%rbp) movq -32(%rbp), %rax movl $0x00000000, %edx movl %edx, (%rax) movq -32(%rbp), %rax leaq 4(%rax), %rdx movl $0x00000000, %eax movl %eax, (%rdx) movq -32(%rbp), %rax leaq 8(%rax), %rdx movl $0x00000000, %eax movl %eax, (%rdx) movq -32(%rbp), %rax leaq 12(%rax), %rdx movl $0x00000000, %eax movl %eax, (%rdx) movaps -64(%rbp), %xmm0 movaps -48(%rbp), %xmm1 addps %xmm1, %xmm0 movaps %xmm0, -80(%rbp) movl $0, 
-4(%rbp) jmp .L11 .L12: movl -4(%rbp), %eax cltq salq $2, %rax addq -32(%rbp), %rax movss (%rax), %xmm0 movss %xmm0, -88(%rbp) movl -4(%rbp), %eax movl %eax, %esi movl $_ZSt4cout, %edi call _ZNSolsEi movl $.LC1, %esi movq %rax, %rdi call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movss -88(%rbp), %xmm0 movq %rax, %rdi call _ZNSolsEf movl $_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, %esi movq %rax, %rdi call _ZNSolsEPFRSoS_E addl $1, -4(%rbp) .L11: cmpl $3, -4(%rbp) setle %al testb %al, %al jne .L12 leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE969: .size _Z12explicit_ssef, .-_Z12explicit_ssef .globl main .type main, @function main: .LFB970: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 call _Z12explicit_mmxv call _Z12implicit_mmxv emms movl $2, %edi call _Z12implicit_ssei movss .LC10(%rip), %xmm0 call _Z12explicit_ssef call getchar movl $0, %eax popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE970: .size main, .-main .type _Z41__static_initialization_and_destruction_0ii, @function _Z41__static_initialization_and_destruction_0ii: .LFB980: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 subq $16, %rsp movl %edi, -4(%rbp) movl %esi, -8(%rbp) cmpl $1, -4(%rbp) jne .L14 cmpl $65535, -8(%rbp) jne .L14 movl $_ZStL8__ioinit, %edi call _ZNSt8ios_base4InitC1Ev movl $_ZNSt8ios_base4InitD1Ev, %eax movl $__dso_handle, %edx movl $_ZStL8__ioinit, %esi movq %rax, %rdi call __cxa_atexit .L14: leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE980: .size _Z41__static_initialization_and_destruction_0ii, .-_Z41__static_initialization_and_destruction_0ii .type _GLOBAL__sub_I__Z12explicit_mmxv, @function _GLOBAL__sub_I__Z12explicit_mmxv: .LFB981: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movl $65535, %esi movl $1, %edi call _Z41__static_initialization_and_destruction_0ii popq %rbp 
.cfi_def_cfa 7, 8 ret .cfi_endproc .LFE981: .size _GLOBAL__sub_I__Z12explicit_mmxv, .-_GLOBAL__sub_I__Z12explicit_mmxv .section .ctors,"aw",@progbits .align 8 .quad _GLOBAL__sub_I__Z12explicit_mmxv .weakref _ZL20__gthrw_pthread_oncePiPFvvE,pthread_once .weakref _ZL27__gthrw_pthread_getspecificj,pthread_getspecific .weakref _ZL27__gthrw_pthread_setspecificjPKv,pthread_setspecific .weakref _ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_,pthread_create .weakref _ZL20__gthrw_pthread_joinmPPv,pthread_join .weakref _ZL21__gthrw_pthread_equalmm,pthread_equal .weakref _ZL20__gthrw_pthread_selfv,pthread_self .weakref _ZL22__gthrw_pthread_detachm,pthread_detach .weakref _ZL22__gthrw_pthread_cancelm,pthread_cancel .weakref _ZL19__gthrw_sched_yieldv,sched_yield .weakref _ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t,pthread_mutex_lock .weakref _ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t,pthread_mutex_trylock .weakref _ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec,pthread_mutex_timedlock .weakref _ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t,pthread_mutex_unlock .weakref _ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t,pthread_mutex_init .weakref _ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t,pthread_mutex_destroy .weakref _ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t,pthread_cond_broadcast .weakref _ZL27__gthrw_pthread_cond_signalP14pthread_cond_t,pthread_cond_signal .weakref _ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t,pthread_cond_wait .weakref _ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec,pthread_cond_timedwait .weakref _ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t,pthread_cond_destroy .weakref _ZL26__gthrw_pthread_key_createPjPFvPvE,pthread_key_create .weakref _ZL26__gthrw_pthread_key_deletej,pthread_key_delete .weakref _ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t,pthread_mutexattr_init .weakref 
_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti,pthread_mutexattr_settype .weakref _ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t,pthread_mutexattr_destroy .section .rodata .align 16 .LC3: .long 1 .long 2 .long 3 .long 4 .align 4 .LC5: .long 1077936128 .align 4 .LC6: .long 1082130432 .align 4 .LC10: .long 1078530008 .ident "GCC: (GNU) 4.6.3 20120306 (Red Hat 4.6.3-2)" .section .note.GNU-stack,"",@progbits