I'm trying to be educational.
Just to prove an earlier point, here's what you get with memset when compiled with -O1: the call is expanded inline into eight word stores (MIPS disassembly below the source). Still sure you can do better?
#include <string.h>
#include <stdint.h>
/* Pointer to a function that takes no arguments and returns nothing. */
typedef void(*renderfunc)(void);
/* Function pointer defined elsewhere; clear() points it at otherfunc once loop_index reaches 256. */
extern renderfunc pFun;
/* Defined elsewhere; the only use visible here is being assigned to pFun. */
void otherfunc(void);
/* Buffer defined elsewhere, addressable either as 1024 bytes (b) or as 256 32-bit words (i). */
extern union {
uint8_t b[1024];
uint32_t i[256];
} a;
/* Index into a.i of the next word to clear; clear() advances it by 8 on every call. */
extern int loop_index;
/*
 * Zero one 32-byte slice (eight 32-bit words) of the buffer `a`, starting at
 * a.i[loop_index], then advance loop_index by eight words.  When loop_index
 * becomes exactly 256, pFun is pointed at otherfunc.
 *
 * The fixed-size memset is kept on purpose: it is what the compiler expands
 * inline into individual word stores.
 */
void clear(void)
{
    enum {
        SLICE_BYTES = 32,   /* bytes zeroed per call */
        SLICE_WORDS = 8,    /* the same slice counted in 32-bit words */
        WORD_LIMIT  = 256   /* loop_index value that triggers the hand-over */
    };
    uint32_t *slice = a.i + loop_index;

    memset(slice, 0, SLICE_BYTES);
    loop_index += SLICE_WORDS;

    if (loop_index != WORD_LIMIT) {
        return;
    }
    pFun = otherfunc;
}
00000000 <clear>:
0: 8f830000 lw v1,0(gp)
4: 00032080 sll a0,v1,0x2
8: 3c020000 lui v0,0x0
c: 24420000 addiu v0,v0,0
10: 00441021 addu v0,v0,a0
14: ac400000 sw zero,0(v0)
18: ac400004 sw zero,4(v0)
1c: ac400008 sw zero,8(v0)
20: ac40000c sw zero,12(v0)
24: ac400010 sw zero,16(v0)
28: ac400014 sw zero,20(v0)
2c: ac400018 sw zero,24(v0)
30: ac40001c sw zero,28(v0)
34: 24620008 addiu v0,v1,8
38: 24030100 li v1,256
3c: 14430004 bne v0,v1,50 <clear+0x50>
40: af820000 sw v0,0(gp)
44: 3c020000 lui v0,0x0
48: 24420000 addiu v0,v0,0
4c: af820000 sw v0,0(gp)
50: 03e00008 jr ra
54: 00000000 nop