Inefficient codegen for loops

I'm investigating memcpy speedups and plan to base my code on modifications to the compiler's output (as a baseline). I'm reading the output generated for this specific function:

```c
/*
 * Naive byte-at-a-time copy, used here as the baseline whose generated
 * code is being inspected.
 *
 * Copies the first sz bytes from s2 into s1, one char per iteration in
 * ascending index order, and returns s1. When sz is 0 the loop body is
 * skipped and s1 is returned unchanged.
 */
void* memcpy_basic(void* s1, const void* s2, size_t sz)
{
    for (size_t i = 0; i < sz; i++)
    {
        ((char*)s1)[i] = ((const char*)s2)[i];
    }
    return s1;
}
```
The assembly produced is:
```asm
[global _memcpy_basic]
; memcpy_basic
_memcpy_basic:
        push    ebx
        push    esi
        push    edi
L_1:
        mov     esi,dword [esp+0ch+0ch]
        mov     edx,dword [esp+08h+0ch]
        mov     eax,dword [esp+04h+0ch]
; Line 14:     for (size_t i = 0; i < sz; i++) 
L_10:
        xor     ebx,ebx
        cmp     ebx,esi
        jnc     L_6
L_4:
; Line 15:     { 
; Line 16:         ((char*)s1)[i] = ((const char*)s2)[i]; 
        mov     edi,ebx
        mov     cl,byte [edx+ebx]
        mov     byte [eax+ebx],cl
; Line 17:     } 
L_7:
        mov     ebx,edi
        inc     ebx
L_5:
        cmp     ebx,esi
        jc      L_4
L_6:
; Line 18:     return s1; 
; Line 19: } 
L_2:
        pop     edi
        pop     esi
        pop     ebx
        ret
```

For some reason, on every loop iteration ebx is saved to edi and then restored from it, but ebx is never used in a way that would warrant this.
Something in the loop optimizer isn't looking across the two blocks L_4 and L_7 and noticing that EBX is only read as an index and then incremented, never overwritten in between, so the extra `mov`s are wasted.

I don't know where that is coming from, but here is what I get when I dump the ICDs (the unoptimized form first, then the form with registers assigned):

```
; Unoptimized
L_4:
	DBG Block START
; Line 15:	    { 

	DBG Block START
; Line 16:	        ((char*)s1)[i] = ((const char*)s2)[i]; 

	T3.A =   _s2:LINK(8).A
	T2.UI =   _i:LINK(0).UI
	T4.I =   T2.UI
	T5.A = T3.A + T4.I
	T6.C =   *(T5).A.C
	T7.A =   _s1:LINK(4).A
	T8.A = T7.A + T4.I
	*(T8).A.C =   T6.C
; Line 17:	    } 

	DBG Block END
	DBG Block END
	BLOCK 5

L_7:
	T2.UI =   _i:LINK(0).UI
	T9.UI = T2.UI + #1.UI
	_i:LINK(0).UI =   T9.UI
	DBG Block END
	BLOCK 6

L_5:
	T1.UI =   _sz:LINK(12).UI
	T2.UI =   _i:LINK(0).UI
	CONDGO	L_4:PC ; T2.UI U< T1.UI
	BLOCK 7
```
```
L_4:
	DBG Block START
; Line 15:	    { 

	DBG Block START
; Line 16:	        ((char*)s1)[i] = ((const char*)s2)[i]; 

	T10(EDI).UI =   T12(EBX).UI
	T9(CL).C =   *(T2(EDX) + T12(EBX)).A.C
	*(T7(EAX) + T12(EBX)).A.C =   T9(CL).C
; Line 17:	    } 

	DBG Block END
	DBG Block END
	BLOCK END
	BLOCK 5

L_7:
	T12(EBX).UI = T10(EDI).UI + #1.UI
	DBG Block END
	BLOCK END
	BLOCK 6
L_5:
	T13(EDI).UI =   T1(ESI).UI
	T14(ECX).UI =   T12(EBX).UI
	CONDGO	L_4:PC ; T14(ECX).UI U< T13(EDI).UI
	BLOCK END
	BLOCK 7
```
To me, this looks like a redundancy-elimination pass wasn't run on that assignment and reassignment (`T10(EDI) = T12(EBX)` followed by `T12(EBX) = T10(EDI) + #1`).
I know very little about the optimizer internals here, so I don't think I'd be of much help beyond pointing out that something odd is going on.


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Inefficient codegen for loops #1109

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Inefficient codegen for loops #1109

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions