I'm investigating memcpy speedups and plan to base my code on modifications to the compiler's output (as a baseline). I'm reading the output for this specific function:
/*
 * memcpy_basic - naive byte-by-byte copy used as the reproducer.
 *
 * Copies sz bytes from s2 to s1, one byte per iteration in ascending
 * index order.
 *
 * s1: destination buffer (written through a char* view)
 * s2: source buffer (read through a const char* view)
 * sz: number of bytes to copy; the loop body does not run when sz is 0
 *
 * Returns s1 unchanged.
 */
void* memcpy_basic(void* s1, const void* s2, size_t sz)
{
/* i is unsigned (size_t), so the loop condition is an unsigned compare. */
for (size_t i = 0; i < sz; i++)
{
/* Copy a single byte from source index i to destination index i. */
((char*)s1)[i] = ((const char*)s2)[i];
}
return s1;
}
The assembly produced is:
;-----------------------------------------------------------------------
; Compiler output for:
;   void* memcpy_basic(void* s1, const void* s2, size_t sz)
; 32-bit x86, Intel operand order (op dst, src); arguments are read
; from the stack.
;
; Register roles after the loads at L_1:
;   eax = s1 (never modified afterwards, so it still holds s1 at ret)
;   edx = s2
;   esi = sz
;   ebx = i (loop index)
;   edi = per-iteration copy of i (see the note at L_4 / L_7)
;   cl  = byte in flight
; ebx, esi and edi are pushed on entry and popped before ret.
;-----------------------------------------------------------------------
[global _memcpy_basic]
; memcpy_basic
_memcpy_basic:
; Save the three registers this function overwrites (3 * 4 = 0ch bytes).
push ebx
push esi
push edi
L_1:
; Each offset is written as <offset at entry> + 0ch, where 0ch accounts
; for the three pushes above.
mov esi,dword [esp+0ch+0ch] ; esi = sz
mov edx,dword [esp+08h+0ch] ; edx = s2
mov eax,dword [esp+04h+0ch] ; eax = s1
; Line 14: for (size_t i = 0; i < sz; i++)
L_10:
xor ebx,ebx ; i = 0
cmp ebx,esi
jnc L_6 ; unsigned i >= sz: skip the loop entirely
L_4:
; Line 15: {
; Line 16: ((char*)s1)[i] = ((const char*)s2)[i];
; The copy of i into edi is undone by "mov ebx,edi" at L_7; no
; instruction in between writes ebx or edi, so the pair is redundant.
mov edi,ebx
mov cl,byte [edx+ebx] ; cl = s2[i]
mov byte [eax+ebx],cl ; s1[i] = cl
; Line 17: }
L_7:
mov ebx,edi ; reloads the value ebx already holds
inc ebx ; i++
L_5:
cmp ebx,esi
jc L_4 ; unsigned i < sz: next iteration
L_6:
; Line 18: return s1;
; Line 19: }
L_2:
; Restore the saved registers in reverse push order; eax still holds s1.
pop edi
pop esi
pop ebx
ret
For some reason, on every loop iteration ebx is saved into edi and then restored, but ebx is never used in a way that would warrant this.
Something with the loop optimizer isn't inspecting between the two blocks of L_4 and L_7 and noticing that EBX is only incremented and not used, so the extra movs are wasted.
I don't know where that is coming from, but here is what I see when I dump the icds:
; Unoptimized
L_4:
DBG Block START
; Line 15: {
DBG Block START
; Line 16: ((char*)s1)[i] = ((const char*)s2)[i];
T3.A = _s2:LINK(8).A
T2.UI = _i:LINK(0).UI
T4.I = T2.UI
T5.A = T3.A + T4.I
T6.C = *(T5).A.C
T7.A = _s1:LINK(4).A
T8.A = T7.A + T4.I
*(T8).A.C = T6.C
; Line 17: }
DBG Block END
DBG Block END
BLOCK 5
L_7:
T2.UI = _i:LINK(0).UI
T9.UI = T2.UI + #1.UI
_i:LINK(0).UI = T9.UI
DBG Block END
BLOCK 6
L_5:
T1.UI = _sz:LINK(12).UI
T2.UI = _i:LINK(0).UI
CONDGO L_4:PC ; T2.UI U< T1.UI
BLOCK 7
L_4:
DBG Block START
; Line 15: {
DBG Block START
; Line 16: ((char*)s1)[i] = ((const char*)s2)[i];
T10(EDI).UI = T12(EBX).UI
T9(CL).C = *(T2(EDX) + T12(EBX)).A.C
*(T7(EAX) + T12(EBX)).A.C = T9(CL).C
; Line 17: }
DBG Block END
DBG Block END
BLOCK END
BLOCK 5
L_7:
T12(EBX).UI = T10(EDI).UI + #1.UI
DBG Block END
BLOCK END
BLOCK 6
L_5:
T13(EDI).UI = T1(ESI).UI
T14(ECX).UI = T12(EBX).UI
CONDGO L_4:PC ; T14(ECX).UI U< T13(EDI).UI
BLOCK END
BLOCK 7
To me, this looks like a redundancy-elimination pass wasn't run on that assignment and reassignment.
I have very little idea about the optimizer internals here, so I don't think I'd be of much help beyond identifying "What's going on over here?"
I'm investigating memcpy speedups and plan to base my code on modifications to the compiler's output (as a baseline). I'm reading the output for this specific function:
The assembly produced is:
For some reason, on every loop iteration ebx is saved into edi and then restored, but ebx is never used in a way that would warrant this.
Something with the loop optimizer isn't inspecting between the two blocks of L_4 and L_7 and noticing that EBX is only incremented and not used, so the extra movs are wasted.
I don't know where that is coming from, but here is what I see when I dump the icds:
To me, this looks like a redundancy-elimination pass wasn't run on that assignment and reassignment.
I have very little idea about the optimizer internals here, so I don't think I'd be of much help beyond identifying "What's going on over here?"