First attempt, in C:
// Multiply-accumulate over the first ADC_BUFF_SIZE/2 samples of the two ADC
// buffers: for each index, multiply the adc1 and adc2 samples, add the product
// to a 64-bit running sum, and store that running sum in mac_buff.
// PA3 is set before and reset after the loop (presumably so the execution
// time can be measured externally on the pin).
GPIOA->BSRR = (1 << 3); // Set PA3
int i;
uint16_t *org1 = adc1_buff;
uint16_t *org2 = adc2_buff;
uint64_t *dst = mac_buff;
uint32_t mul;
uint64_t mac = 0;
for (i = ADC_BUFF_SIZE / 2; i; i--) {
    // Both uint16_t operands would otherwise be promoted to signed int, and a
    // product >= 2^31 (e.g. 0xFFFF * 0xFFFF) would overflow int: undefined
    // behavior. Casting one operand to uint32_t makes the multiply unsigned,
    // and any 16x16-bit product fits in 32 bits.
    mul = (uint32_t)(*org1++) * (*org2++);
    mac += mul;   // zero-extended into the 64-bit running sum
    *dst++ = mac; // one running-sum entry per sample
}
GPIOA->BRR = (1 << 3); // Reset PA3
This code takes 138 µs to execute for a half-buffer size of 1000 positions.
That is 138 ns per sample, compared to 188 ns per ADC sample.
For a first attempt it is not bad at all.
Disassembly:
125 GPIOA->BSRR = (1 << 3); // Set PA3
0800078a: mov.w r8, #1207959552 @ 0x48000000
131 uint64_t mac = 0;
0800078e: movs r2, #0
129 uint64_t *dst = mac_buff;
08000790: ldr r4, [pc, #84] @ (0x80007e8 <main+448>)
128 uint16_t *org2 = adc2_buff;
08000792: ldr r6, [pc, #88] @ (0x80007ec <main+452>)
127 uint16_t *org1 = adc1_buff;
08000794: ldr r0, [pc, #88] @ (0x80007f0 <main+456>)
125 GPIOA->BSRR = (1 << 3); // Set PA3
08000796: str.w r5, [r8, #24]
131 uint64_t mac = 0;
0800079a: mov r1, r2
134 mul = (*org1++) * (*org2++);
0800079c: ldrh.w r3, [r0], #2
080007a0: ldrh.w r12, [r6], #2
080007a4: mul.w r3, r12, r3
135 mac += mul;
080007a8: adds r2, r3, r2
136 *dst++ = mac;
080007aa: str.w r2, [r4], #8
135 mac += mul;
080007ae: adc.w r1, r1, r3, asr #31
133 for(i=ADC_BUFF_SIZE/2; i; i--) {
080007b2: cmp r0, r7
080007b4: str.w r1, [r4, #-4]
080007b8: bne.n 0x800079c <main+372>
138 GPIOA->BRR = (1 << 3); // Reset PA3
080007ba: str.w r5, [r8, #40] @ 0x28