Saturday, April 27, 2013

Templates in C++ do not hinder optimizations, such as auto-vectorization, contrary to what I expected.

Example:


// /dev/bin/g++ -m64 -S -std=c++11 -march=native -O3 test.cpp 
// #include <...>  (header name missing in this listing; nothing below requires a standard header)

// Number of elements processed by each call to test1 (2^16 = 65536).
constexpr int SIZE = 1 << 16;

// Non-template, float-only version of the same loop (commented out):
/*
void test1(float *__restrict__ a, float *__restrict__ b)
{
  int i;

  float *x = (float*)__builtin_assume_aligned(a, 16);
  float *y = (float*)__builtin_assume_aligned(b, 16);

  for (i = 0; i < SIZE; i++)
    {
      x[i] += y[i];
    }
}
*/

/// Element-wise in-place addition: a[i] += b[i] for i in [0, SIZE).
///
/// @tparam T  Element type; must support `operator+=`.
/// @param a   Destination array, read and written. Must hold at least SIZE
///            elements and start on a 16-byte boundary.
/// @param b   Source array, read only. Must hold at least SIZE elements and
///            start on a 16-byte boundary.
///
/// Both pointers are `__restrict__`-qualified, so the caller promises that the
/// two arrays do not overlap. Breaking the alignment or the no-overlap promise
/// is undefined behaviour.
template <typename T>
void test1(T *__restrict__ a, T *__restrict__ b)
{
  // Pass the 16-byte alignment guarantee on to the optimizer so the vectorized
  // loop can use aligned SIMD loads and stores.
  T *x = static_cast<T *>(__builtin_assume_aligned(a, 16));
  T *y = static_cast<T *>(__builtin_assume_aligned(b, 16));

  for (int i = 0; i < SIZE; i++)
    {
      x[i] += y[i];
    }
}

template void test1(float * __restrict__ a, float * __restrict__ b);
template void test1(double * __restrict__ a, double * __restrict__ b);
template void test1(int * __restrict__ a, int * __restrict__ b);
template void test1(char * __restrict__ a, char * __restrict__ b);




The assembly generated by GCC 4.9 is:
 .text
 .align 4,0x90
 .globl void test1<float>(float*, float*)
void test1<float>(float*, float*):
LFB237:
 xorl %eax, %eax
 .align 4,0x90
L2:
 movaps (%rdi,%rax), %xmm0
 addps (%rsi,%rax), %xmm0
 movaps %xmm0, (%rdi,%rax)
 addq $16, %rax
 cmpq $262144, %rax
 jne L2
 rep; ret
LFE237:
 .align 4,0x90
 .globl void test1<double>(double*, double*)
void test1<double>(double*, double*):
LFB238:
 xorl %eax, %eax
 .align 4,0x90
L6:
 movapd (%rdi,%rax), %xmm0
 addpd (%rsi,%rax), %xmm0
 movapd %xmm0, (%rdi,%rax)
 addq $16, %rax
 cmpq $524288, %rax
 jne L6
 rep; ret
LFE238:
 .align 4,0x90
 .globl void test1<int>(int*, int*)
void test1<int>(int*, int*):
LFB239:
 xorl %eax, %eax
 .align 4,0x90
L9:
 movdqa (%rdi,%rax), %xmm0
 paddd (%rsi,%rax), %xmm0
 movdqa %xmm0, (%rdi,%rax)
 addq $16, %rax
 cmpq $262144, %rax
 jne L9
 rep; ret
LFE239:
 .align 4,0x90
 .globl void test1<char>(char*, char*)
void test1<char>(char*, char*):
LFB240:
 xorl %eax, %eax
 .align 4,0x90
L12:
 movdqa (%rsi,%rax), %xmm0
 paddb (%rdi,%rax), %xmm0
 movdqa %xmm0, (%rdi,%rax)
 addq $16, %rax
 cmpq $65536, %rax
 jne L12
 rep; ret
LFE240:
 .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EH_frame1:
 .set L$set$0,LECIE1-LSCIE1
 .long L$set$0
LSCIE1:
 .long 0
 .byte 0x1
 .ascii "zR\0"
 .byte 0x1
 .byte 0x78
 .byte 0x10
 .byte 0x1
 .byte 0x10
 .byte 0xc
 .byte 0x7
 .byte 0x8
 .byte 0x90
 .byte 0x1
 .align 3
LECIE1:
LSFDE1:
 .set L$set$1,LEFDE1-LASFDE1
 .long L$set$1
LASFDE1:
 .long LASFDE1-EH_frame1
 .quad LFB237-.
 .set L$set$2,LFE237-LFB237
 .quad L$set$2
 .byte 0
 .align 3
LEFDE1:
LSFDE3:
 .set L$set$3,LEFDE3-LASFDE3
 .long L$set$3
LASFDE3:
 .long LASFDE3-EH_frame1
 .quad LFB238-.
 .set L$set$4,LFE238-LFB238
 .quad L$set$4
 .byte 0
 .align 3
LEFDE3:
LSFDE5:
 .set L$set$5,LEFDE5-LASFDE5
 .long L$set$5
LASFDE5:
 .long LASFDE5-EH_frame1
 .quad LFB239-.
 .set L$set$6,LFE239-LFB239
 .quad L$set$6
 .byte 0
 .align 3
LEFDE5:
LSFDE7:
 .set L$set$7,LEFDE7-LASFDE7
 .long L$set$7
LASFDE7:
 .long LASFDE7-EH_frame1
 .quad LFB240-.
 .set L$set$8,LFE240-LFB240
 .quad L$set$8
 .byte 0
 .align 3
LEFDE7:
 .constructor
 .destructor
 .align 1
 .subsections_via_symbols

No comments: