First, the speed of the template wrapper instantiation is going to be almost entirely at the mercy of your optimizer.
That said, I've reduced your samples to the most basic code I can think of, specifically to check how the function parameter is invoked. You can read on, but you'll see that all of them are invoked in exactly the same way; there is no benefit to one declaration over another. I also included the one form you left out (the reference declaration).
#include <cstdio>
/// Trivial callee used as the target of the apply* wrappers.
/// @param x  any integer
/// @return   the same value that was passed in
int hello(int x)
{
    const int result = x;
    return result;
}
/// Invokes `f` with `x`; variant whose callable parameter is declared with
/// a plain function type, `Type (f)(Type)`.
/// @param x  argument forwarded to `f`
/// @param f  function to call
template<typename Type>
void apply1(Type x, Type (f)(Type))
{
    // The result of the call is deliberately thrown away.
    static_cast<void>(f(x));
}
/// Invokes `f` with `x`; variant whose callable parameter is declared as
/// a pointer to function, `Type (*f)(Type)`.
/// @param x  argument forwarded to `f`
/// @param f  pointer to the function to call
template<typename Type>
void apply2(Type x, Type (*f)(Type))
{
    // The result of the call is deliberately thrown away.
    static_cast<void>(f(x));
}
/// Invokes `f` with `x`; variant whose callable parameter is declared as
/// a reference to function, `Type (&f)(Type)`.
/// @param x  argument forwarded to `f`
/// @param f  reference to the function to call
template<typename Type>
void apply3(Type x, Type (&f)(Type))
{
    // The result of the call is deliberately thrown away.
    static_cast<void>(f(x));
}
/// Driver: instantiates each apply* variant once with `hello` so the
/// compiler emits code for all three parameter declaration forms.
int main(int argc, char *argv[])
{
    // The command-line arguments play no part in this demonstration.
    static_cast<void>(argc);
    static_cast<void>(argv);

    apply1(1, hello);   // function-type parameter
    apply2(2, hello);   // pointer-to-function parameter
    apply3(3, hello);   // reference-to-function parameter

    return 0;
}
The actual asm generated from the deductions is:
apply1
## Compiler output for the apply1<int> instantiation (AT&T syntax, x86-64).
## The values are spilled to the stack and reloaded before use, which looks
## like unoptimized output -- TODO confirm the build flags.
__Z6apply1IiEvT_PFS0_S0_E:
Leh_func_begin2:
## Prologue: save the caller's frame pointer and establish a new frame.
pushq %rbp
Ltmp2:
movq %rsp, %rbp
Ltmp3:
## Reserve 16 bytes of stack for the two spilled parameters.
subq $16, %rsp
Ltmp4:
## Spill the incoming 32-bit value (presumably x) and the 64-bit value
## (presumably the address of f) to their stack slots.
movl %edi, -4(%rbp)
movq %rsi, -16(%rbp)
## Reload both, put the 32-bit value back in %edi as the call argument.
movq -16(%rbp), %rax
movl -4(%rbp), %ecx
movl %ecx, %edi
## Indirect call through the address held in %rax.
callq *%rax
## Epilogue: release the stack space and restore the frame pointer.
addq $16, %rsp
popq %rbp
ret
Leh_func_end2:
apply2
## Compiler output for the apply2<int> instantiation (AT&T syntax, x86-64).
## The values are spilled to the stack and reloaded before use, which looks
## like unoptimized output -- TODO confirm the build flags.
__Z6apply2IiEvT_PFS0_S0_E:
Leh_func_begin3:
## Prologue: save the caller's frame pointer and establish a new frame.
pushq %rbp
Ltmp5:
movq %rsp, %rbp
Ltmp6:
## Reserve 16 bytes of stack for the two spilled parameters.
subq $16, %rsp
Ltmp7:
## Spill the incoming 32-bit value (presumably x) and the 64-bit value
## (presumably the address of f) to their stack slots.
movl %edi, -4(%rbp)
movq %rsi, -16(%rbp)
## Reload both, put the 32-bit value back in %edi as the call argument.
movq -16(%rbp), %rax
movl -4(%rbp), %ecx
movl %ecx, %edi
## Indirect call through the address held in %rax.
callq *%rax
## Epilogue: release the stack space and restore the frame pointer.
addq $16, %rsp
popq %rbp
ret
Leh_func_end3:
apply3
## Compiler output for the apply3<int> instantiation (AT&T syntax, x86-64).
## The values are spilled to the stack and reloaded before use, which looks
## like unoptimized output -- TODO confirm the build flags.
__Z6apply3IiEvT_RFS0_S0_E:
Leh_func_begin4:
## Prologue: save the caller's frame pointer and establish a new frame.
pushq %rbp
Ltmp8:
movq %rsp, %rbp
Ltmp9:
## Reserve 16 bytes of stack for the two spilled parameters.
subq $16, %rsp
Ltmp10:
## Spill the incoming 32-bit value (presumably x) and the 64-bit value
## (presumably the address of f) to their stack slots.
movl %edi, -4(%rbp)
movq %rsi, -16(%rbp)
## Reload both, put the 32-bit value back in %edi as the call argument.
movq -16(%rbp), %rax
movl -4(%rbp), %ecx
movl %ecx, %edi
## Indirect call through the address held in %rax.
callq *%rax
## Epilogue: release the stack space and restore the frame pointer.
addq $16, %rsp
popq %rbp
ret
Leh_func_end4:
They are identical (as I suspected they would be). There is no difference that I can see whatsoever.
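Note: it is worth looking at how the compiler actually saw these declarations, which the mangled names reveal. apply1 and apply2 carry the identical parameter encoding PFS0_S0_E (pointer to function), because a parameter declared with a function type is adjusted to a pointer to function; only apply3 differs, with RFS0_S0_E (reference to function):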
apply1: __Z6apply1IiEvT_PFS0_S0_E
apply2: __Z6apply2IiEvT_PFS0_S0_E
apply3: __Z6apply3IiEvT_RFS0_S0_E