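This is the disassembly of the code above. Nothing clever is going on; the main gain is that the loop compares one 32-bit word per iteration, so 2048 bytes take 512 loop iterations instead of 2048. Note that `lsrs r2, r2, #2` divides the count by four, so if the count is not a multiple of four the trailing 1–3 bytes are never compared.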

`ft_memcmp:`

08043b54: lsrs r2, r2, #2

364 while (count>0)

08043b56: cbz r2, 0x8043b78 <ft_memcmp+36>

356 {

08043b58: push {r4}

366 if (*a1 != *a2)

08043b5a: ldr r3, [r1, #0]

08043b5c: ldr r4, [r0, #0]

08043b5e: cmp r4, r3

08043b60: bne.n 0x8043b74 <ft_memcmp+32>

371 a1++;

08043b62: adds r0, #4

372 a2++;

08043b64: adds r1, #4

373 count--;

08043b66: subs r2, #1

364 while (count>0)

08043b68: cmp r2, #0

08043b6a: bne.n 0x8043b5a <ft_memcmp+6>

357 bool ret = true;

08043b6c: movs r0, #1

377 }

08043b6e: ldr.w r4, [sp], #4

08043b72: bx lr

368 ret=false;

08043b74: movs r0, #0

08043b76: b.n 0x8043b6e <ft_memcmp+26>

357 bool ret = true;

08043b78: movs r0, #1

377 }

08043b7a: bx lr

901 {