First attempt, in C:

GPIOA->BSRR = (1 << 3); // Set PA3 (marks the start of the loop, e.g. for scope timing)

int i;
uint16_t *org1 = adc1_buff;
uint16_t *org2 = adc2_buff;
uint64_t *dst = mac_buff;
uint32_t mul;
uint64_t mac = 0;

// Running multiply-accumulate over the first ADC_BUFF_SIZE/2 entries:
// each iteration multiplies one sample from adc1_buff by the matching
// sample from adc2_buff, adds the product to the 64-bit running sum and
// stores that running sum in the next slot of mac_buff.
for (i = ADC_BUFF_SIZE / 2; i; i--) {
    // Cast before multiplying: uint16_t operands are otherwise promoted to
    // signed int, so a product >= 2^31 would be a signed overflow (undefined
    // behaviour) and could be sign-extended into the upper word of mac.
    mul = (uint32_t)(*org1++) * (*org2++);
    mac += mul;
    *dst++ = mac;
}

GPIOA->BRR = (1 << 3); // Reset PA3 (marks the end of the loop)

This code takes 138 µs to execute for a half-buffer size of 1000 positions.

That is 138 ns per sample, compared to 188 ns for ADC samples.

For a first attempt it is not bad at all.

Disassembly:

`125 GPIOA->BSRR = (1 << 3); // Set PA3`

0800078a: mov.w r8, #1207959552 @ 0x48000000

131 uint64_t mac = 0;

0800078e: movs r2, #0

129 uint64_t *dst = mac_buff;

08000790: ldr r4, [pc, #84] @ (0x80007e8 <main+448>)

128 uint16_t *org2 = adc2_buff;

08000792: ldr r6, [pc, #88] @ (0x80007ec <main+452>)

127 uint16_t *org1 = adc1_buff;

08000794: ldr r0, [pc, #88] @ (0x80007f0 <main+456>)

125 GPIOA->BSRR = (1 << 3); // Set PA3

08000796: str.w r5, [r8, #24]

131 uint64_t mac = 0;

0800079a: mov r1, r2

134 mul = (*org1++) * (*org2++);

0800079c: ldrh.w r3, [r0], #2

080007a0: ldrh.w r12, [r6], #2

080007a4: mul.w r3, r12, r3

135 mac += mul;

080007a8: adds r2, r3, r2

136 *dst++ = mac;

080007aa: str.w r2, [r4], #8

135 mac += mul;

080007ae: adc.w r1, r1, r3, asr #31

133 for(i=ADC_BUFF_SIZE/2; i; i--) {

080007b2: cmp r0, r7

080007b4: str.w r1, [r4, #-4]

080007b8: bne.n 0x800079c <main+372>

138 GPIOA->BRR = (1 << 3); // Reset PA3

080007ba: str.w r5, [r8, #40] @ 0x28