This is the disassembly of the code above. Nothing clever; the main gain is that it compares one 32-bit word per iteration, so the loop runs 512 times instead of 2048.
; ft_memcmp -- word-wise equality check (mixed source/disassembly listing, Thumb-2 encodings).
; In:   r0 = a1, r1 = a2 (each is read with a 32-bit ldr and advanced by 4 per pass)
;       r2 = count; shifted right by 2 on entry, so presumably a byte count -- TODO confirm
; Out:  r0 = 1 if every compared word matched (or there was nothing to compare),
;       r0 = 0 at the first mismatching word
; Modifies r0-r3 and the flags; r4 is saved on the stack and restored before returning.
; NOTE(review): the entry shift discards the low two bits of count. If count is a byte
;               count, up to 3 trailing bytes are never compared -- verify that callers
;               only pass multiples of 4.
; NOTE(review): the result is a bool (equal / not equal), not the <0 / 0 / >0 ordering
;               returned by the C library memcmp.
ft_memcmp:
; count >>= 2 (logical shift): r2 now counts 4-byte words.
08043b54: lsrs r2, r2, #2
364 while (count>0)
; No words to compare: branch to the early exit, which returns 1 without touching the stack.
08043b56: cbz r2, 0x8043b78 <ft_memcmp+36>
356 {
; r4 is used below to hold the word loaded from a1, so save it first.
08043b58: push {r4}
366 if (*a1 != *a2)
; Loop top (branch target of the bne.n at 08043b6a): r3 = *a2, r4 = *a1.
08043b5a: ldr r3, [r1, #0]
08043b5c: ldr r4, [r0, #0]
08043b5e: cmp r4, r3
; Words differ: leave the loop through the "ret = false" path.
08043b60: bne.n 0x8043b74 <ft_memcmp+32>
371 a1++;
; Advance both pointers by one word (4 bytes).
08043b62: adds r0, #4
372 a2++;
08043b64: adds r1, #4
373 count--;
08043b66: subs r2, #1
364 while (count>0)
; Loop again while words remain. The subs above already set Z for this test;
; the explicit cmp is what the compiler emitted.
08043b68: cmp r2, #0
08043b6a: bne.n 0x8043b5a <ft_memcmp+6>
357 bool ret = true;
; Every word matched: r0 = 1.
08043b6c: movs r0, #1
377 }
; Shared epilogue: post-indexed load restores r4 and adds 4 to sp, then return.
08043b6e: ldr.w r4, [sp], #4
08043b72: bx lr
368 ret=false;
; Mismatch path: r0 = 0, then rejoin the shared epilogue to restore r4.
08043b74: movs r0, #0
08043b76: b.n 0x8043b6e <ft_memcmp+26>
357 bool ret = true;
; Early exit for a zero word count: r4 was never pushed, so return 1 directly.
08043b78: movs r0, #1
377 }
08043b7a: bx lr
901 {