Templates in C++ do not hinder optimizations, such as auto-vectorization, contrary to what I expected.
Example:
// /dev/bin/g++ -m64 -S -std=c++11 -march=native -O3 test.cpp
// NOTE: the header name of the original `#include` line was lost; nothing
// in this snippet requires a header, so the directive is omitted here.

// Number of elements processed by every test1 instantiation.
constexpr int SIZE = (1L << 16);

/*
// Non-template reference version, kept for comparison with the template.
void test1(float *__restrict__ a, float *__restrict__ b)
{
    int i;
    float *x = (float*)__builtin_assume_aligned(a, 16);
    float *y = (float*)__builtin_assume_aligned(b, 16);
    for (i = 0; i < SIZE; i++)
    {
        x[i] += y[i];
    }
}
*/

/// Adds b[i] to a[i] in place for the first SIZE elements.
///
/// @tparam T  element type; must support `+=`.
/// @param a   destination array of at least SIZE elements. The caller
///            promises it is 16-byte aligned and does not alias `b`.
/// @param b   source array of at least SIZE elements, with the same
///            alignment and no-alias promise.
///
/// The `__restrict__` qualifiers and `__builtin_assume_aligned` are promises
/// made to the compiler (GCC/Clang extensions); breaking either one is
/// undefined behaviour.
template <typename T>
void test1(T *__restrict__ a, T *__restrict__ b)
{
    // Re-derive the pointers so the optimizer may assume 16-byte alignment.
    T *x = static_cast<T *>(__builtin_assume_aligned(a, 16));
    T *y = static_cast<T *>(__builtin_assume_aligned(b, 16));
    for (int i = 0; i < SIZE; i++)
    {
        x[i] += y[i];
    }
}

// Explicit instantiations: force code generation for each element type so
// the assembly for all four variants appears in the compiler output.
template void test1<float>(float *__restrict__ a, float *__restrict__ b);
template void test1<double>(double *__restrict__ a, double *__restrict__ b);
template void test1<int>(int *__restrict__ a, int *__restrict__ b);
template void test1<char>(char *__restrict__ a, char *__restrict__ b);
The generated assembly file (by GCC 4.9) will be:
# Assembly emitted by GCC 4.9 for the four explicit instantiations of test1.
# Symbol names are shown demangled. Each loop body uses packed SSE
# instructions and advances the byte offset in %rax by 16 per iteration.
    .text
    .align 4,0x90
    .globl void test1<float>(float*, float*)
void test1<float>(float*, float*):
LFB237:
    xorl    %eax, %eax
    .align 4,0x90
L2:
    movaps  (%rdi,%rax), %xmm0
    addps   (%rsi,%rax), %xmm0
    movaps  %xmm0, (%rdi,%rax)
    addq    $16, %rax
    cmpq    $262144, %rax
    jne     L2
    rep; ret
LFE237:
    .align 4,0x90
    .globl void test1<double>(double*, double*)
void test1<double>(double*, double*):
LFB238:
    xorl    %eax, %eax
    .align 4,0x90
L6:
    movapd  (%rdi,%rax), %xmm0
    addpd   (%rsi,%rax), %xmm0
    movapd  %xmm0, (%rdi,%rax)
    addq    $16, %rax
    cmpq    $524288, %rax
    jne     L6
    rep; ret
LFE238:
    .align 4,0x90
    .globl void test1<int>(int*, int*)
void test1<int>(int*, int*):
LFB239:
    xorl    %eax, %eax
    .align 4,0x90
L9:
    movdqa  (%rdi,%rax), %xmm0
    paddd   (%rsi,%rax), %xmm0
    movdqa  %xmm0, (%rdi,%rax)
    addq    $16, %rax
    cmpq    $262144, %rax
    jne     L9
    rep; ret
LFE239:
    .align 4,0x90
    .globl void test1<char>(char*, char*)
void test1<char>(char*, char*):
LFB240:
    xorl    %eax, %eax
    .align 4,0x90
L12:
    movdqa  (%rsi,%rax), %xmm0
    paddb   (%rdi,%rax), %xmm0
    movdqa  %xmm0, (%rdi,%rax)
    addq    $16, %rax
    cmpq    $65536, %rax
    jne     L12
    rep; ret
LFE240:
    .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
    .set L$set$0,LECIE1-LSCIE1
    .long L$set$0
LSCIE1:
    .long 0
    .byte 0x1
    .ascii "zR\0"
    .byte 0x1
    .byte 0x78
    .byte 0x10
    .byte 0x1
    .byte 0x10
    .byte 0xc
    .byte 0x7
    .byte 0x8
    .byte 0x90
    .byte 0x1
    .align 3
LECIE1:
LSFDE1:
    .set L$set$1,LEFDE1-LASFDE1
    .long L$set$1
LASFDE1:
    .long LASFDE1-EH_frame1
    .quad LFB237-.
    .set L$set$2,LFE237-LFB237
    .quad L$set$2
    .byte 0
    .align 3
LEFDE1:
LSFDE3:
    .set L$set$3,LEFDE3-LASFDE3
    .long L$set$3
LASFDE3:
    .long LASFDE3-EH_frame1
    .quad LFB238-.
    .set L$set$4,LFE238-LFB238
    .quad L$set$4
    .byte 0
    .align 3
LEFDE3:
LSFDE5:
    .set L$set$5,LEFDE5-LASFDE5
    .long L$set$5
LASFDE5:
    .long LASFDE5-EH_frame1
    .quad LFB239-.
    .set L$set$6,LFE239-LFB239
    .quad L$set$6
    .byte 0
    .align 3
LEFDE5:
LSFDE7:
    .set L$set$7,LEFDE7-LASFDE7
    .long L$set$7
LASFDE7:
    .long LASFDE7-EH_frame1
    .quad LFB240-.
    .set L$set$8,LFE240-LFB240
    .quad L$set$8
    .byte 0
    .align 3
LEFDE7:
    .constructor
    .destructor
    .align 1
    .subsections_via_symbols
No comments:
Post a Comment