Here's a snippet for the NCO and downconversion. By amalgamating the two, unrolling the loop by 8 and interleaving mutually exclusive operations, memory accesses can be minimised. Note the use of .balign 4: this is to stop 32 bit instructions straddling words, which significantly affects performance. All floating point ops are 32 bit. Also note the use of carefully selected registers and adjacent store and load multiple instructions to minimise cycles.
typedef float32_t SAMPLETYPE; // Sample data type: 32 bit float (all FP ops here are single precision)
typedef float32_t COEFFTYPE;  // Filter coefficient data type
// NCO state: the current unit phasor (stI,stQ) plus the per-sample rotation
// constants (stA,stB) = (cos,sin) of the phase increment (set up in NCOInit).
typedef struct
{
SAMPLETYPE stI; // In-phase (real) part of the current phasor
SAMPLETYPE stQ; // Quadrature (imaginary) part of the current phasor
SAMPLETYPE stA; // cos(2*PI*stFreq/stSampleRate) rotation constant
SAMPLETYPE stB; // sin(2*PI*stFreq/stSampleRate) rotation constant
} NCOSTRUCT;
NCOSTRUCT _ncos; // The NCO instance used by the downconverter
COMPLEXSTRUCT _acsIn[NUM_SAMPLES]; // Input block buffer (one block of complex samples)
COMPLEXSTRUCT _acsOut[NUM_SAMPLES*INT_FACTOR/DEC_FACTOR]; // Output buffer, sized by the resampling ratio
// Initialise the NCO: the phasor starts at (1,0) and each NCONext step rotates
// it by 2*PI*stFreq/stSampleRate radians using the (stA,stB)=(cos,sin) constants.
// pncos        - NCO state to initialise
// stSampleRate - sample rate in Hz
// stFreq       - NCO frequency in Hz
static void NCOInit(NCOSTRUCT *pncos,SAMPLETYPE stSampleRate,SAMPLETYPE stFreq)
{
double dPhaseInc=2.0*PI*stFreq/stSampleRate; // Hoisted common phase increment; computed in double before narrowing
pncos->stA=(SAMPLETYPE)cos(dPhaseInc);
pncos->stB=(SAMPLETYPE)sin(dPhaseInc);
pncos->stI=(SAMPLETYPE)1.0; // Phase 0, unit amplitude
pncos->stQ=(SAMPLETYPE)0.0;
}
// How you'd do it if you didn't care about CPU, call this once for each sample pair to get the next NCO value.
// Generates nNumSamples successive NCO outputs into pcs by rotating the phasor
// (stI,stQ) through (stA,stB) once per sample; the final phase is written back
// to *pncos so successive calls are phase-continuous.
__RAMFUNC(RAM2) static void NCONext(NCOSTRUCT *pncos,COMPLEXSTRUCT *pcs,int nNumSamples)
{
SAMPLETYPE stA=pncos->stA; // Cache rotation constants and phase locally
SAMPLETYPE stB=pncos->stB;
SAMPLETYPE stI=pncos->stI;
SAMPLETYPE stQ=pncos->stQ;
COMPLEXSTRUCT *pcsEnd=pcs+nNumSamples;
while (pcs<pcsEnd)
{
SAMPLETYPE stNewI=stA*stI-stB*stQ; // Complex multiply by (A+jB)
stQ=stB*stI+stA*stQ;
stI=stNewI;
pcs->stI=stI;
pcs->stQ=stQ;
pcs++;
}
pncos->stI=stI; // Write the advanced phase back for the next call
pncos->stQ=stQ;
}
// This is the fast way to do it...
// Amalgamation of the NCO and downconvert function, unrolled by 8, call this every 4096 samples.
// For each input sample: rotate the NCO phasor (s16,s17) by (A,B)=(s18,s19), then
// out[n] = (I[n]*in[n], Q[n]*in[n]) with in[] converted from unsigned ADC counts to -1..+1.
// The final NCO phase is stored back to *pncos so successive calls are phase-continuous.
// FIX(review): added "memory" and "cc" to the clobber list - the asm reads *pstOffset /
// *pu32In and writes *pcsOut / *pncos, and subs alters the flags; without these the
// compiler may cache or reorder memory accesses around the asm block.
__RAMFUNC(RAM2) static void DownConvert(NCOSTRUCT *pncos,uint32_t *pu32In,COMPLEXSTRUCT *pcsOut,int nNumSamples)
{
// This function produces the quadrature NCO and complex mixes it with the RF (real) input
SAMPLETYPE stOffset=(SAMPLETYPE)1; // data comes in from ADC 0 to 4095, this is the offset applied to move it to -1 to +1
SAMPLETYPE *pstOffset=&stOffset;
__asm__ __volatile__
(
"\n\t"
"vldm %[NCO],{s16-s19} \n\t" // Load NCO registers
"mov r6,%[NUMSAMPLES] \n\t"
"mov r5,r6,ASR #3 \n\t" // R5 is the division result for unrolling by 8
"and r6,r6,#7 \n\t" // R6 is the remainder, use to take the stragglers
"cbnz r5,loop1 \n\t"
"b loopexit1 \n\t"
".balign 4 \n\t"
"\nloop1: \n\t" // Do the unrolled version first
"vldm %[IN]!,{s24-s31} \n\t" // Load unsigned int samples
"vldm %[OFFSET],{s0} \n\t" // Reload 1.0 offset for 2's comp. conversion
"vcvt.f32.u32 s24,s24,11 \n\t" // Convert unsigned int to float (11 fraction bits, i.e. divide by 2048)
"vcvt.f32.u32 s25,s25,11 \n\t"
"vcvt.f32.u32 s26,s26,11 \n\t"
"vcvt.f32.u32 s27,s27,11 \n\t"
"vcvt.f32.u32 s28,s28,11 \n\t"
"vcvt.f32.u32 s29,s29,11 \n\t"
"vcvt.f32.u32 s30,s30,11 \n\t"
"vcvt.f32.u32 s31,s31,11 \n\t"
"vsub.f32 s24,s24,s0 \n\t" // Subtract 1 to make signed -1 to +1
"vsub.f32 s25,s25,s0 \n\t"
"vsub.f32 s26,s26,s0 \n\t"
"vsub.f32 s27,s27,s0 \n\t"
"vsub.f32 s28,s28,s0 \n\t"
"vsub.f32 s29,s29,s0 \n\t"
"vsub.f32 s30,s30,s0 \n\t"
"vsub.f32 s31,s31,s0 \n\t"
"vmul.f32 s20,s18,s16 \n\t" // 0.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 0.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 0.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 0.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 0.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 0.5 s17=B*I+A*Q
"vmul.f32 s0,s16,s24 \n\t" // 0.6 s0=I*In[0]
"vmul.f32 s1,s17,s24 \n\t" // 0.7 s1=Q*In[0]
"vmul.f32 s20,s18,s16 \n\t" // 1.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 1.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 1.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 1.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 1.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 1.5 s17=B*I+A*Q
"vmul.f32 s2,s16,s25 \n\t" // 1.6 s2=I*In[1]
"vmul.f32 s3,s17,s25 \n\t" // 1.7 s3=Q*In[1]
"vmul.f32 s20,s18,s16 \n\t" // 2.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 2.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 2.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 2.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 2.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 2.5 s17=B*I+A*Q
"vmul.f32 s4,s16,s26 \n\t" // 2.6 s4=I*In[2]
"vmul.f32 s5,s17,s26 \n\t" // 2.7 s5=Q*In[2]
"vmul.f32 s20,s18,s16 \n\t" // 3.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 3.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 3.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 3.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 3.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 3.5 s17=B*I+A*Q
"vmul.f32 s6,s16,s27 \n\t" // 3.6 s6=I*In[3]
"vmul.f32 s7,s17,s27 \n\t" // 3.7 s7=Q*In[3]
"vmul.f32 s20,s18,s16 \n\t" // 4.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 4.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 4.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 4.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 4.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 4.5 s17=B*I+A*Q
"vmul.f32 s8,s16,s28 \n\t" // 4.6 s8=I*In[4]
"vmul.f32 s9,s17,s28 \n\t" // 4.7 s9=Q*In[4]
"vmul.f32 s20,s18,s16 \n\t" // 5.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 5.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 5.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 5.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 5.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 5.5 s17=B*I+A*Q
"vmul.f32 s10,s16,s29 \n\t" // 5.6 s10=I*In[5]
"vmul.f32 s11,s17,s29 \n\t" // 5.7 s11=Q*In[5]
"vmul.f32 s20,s18,s16 \n\t" // 6.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 6.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 6.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 6.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 6.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 6.5 s17=B*I+A*Q
"vmul.f32 s12,s16,s30 \n\t" // 6.6 s12=I*In[6]
"vmul.f32 s13,s17,s30 \n\t" // 6.7 s13=Q*In[6]
"vmul.f32 s20,s18,s16 \n\t" // 7.0 s20=A*I
"vmul.f32 s21,s19,s17 \n\t" // 7.1 s21=B*Q
"vmul.f32 s22,s19,s16 \n\t" // 7.3 s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // 7.4 s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // 7.2 s16=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // 7.5 s17=B*I+A*Q
"vmul.f32 s14,s16,s31 \n\t" // 7.6 s14=I*In[7]
"vmul.f32 s15,s17,s31 \n\t" // 7.7 s15=Q*In[7]
"vstm %[OUT]!,{s0-s15} \n\t" // Store complex downconverted version
"subs r5,#1 \n\t"
"bne loop1 \n\t"
"\nloopexit1: \n\t"
"cbz r6,loopexit2 \n\t"
".balign 4 \n\t"
"vldm %[OFFSET],{s0} \n\t" // Reload 1.0 offset for 2's comp. conversion
"\nloop2: \n\t" // This is for the straggler samples remaining from the unrolled version
"vldm %[IN]!,{s24} \n\t" // Load next sample
"vmul.f32 s20,s18,s16 \n\t" // s20=A*I
"vcvt.f32.u32 s24,s24,11 \n\t" // Convert unsigned int to float
"vmul.f32 s21,s19,s17 \n\t" // s21=B*Q
"vsub.f32 s24,s24,s0 \n\t" // Subtract 1 to make signed -1 to +1
"vmul.f32 s22,s19,s16 \n\t" // s22=B*I
"vmul.f32 s23,s18,s17 \n\t" // s23=A*Q
"vsub.f32 s16,s20,s21 \n\t" // I=A*I-B*Q
"vadd.f32 s17,s22,s23 \n\t" // Q=B*I+A*Q
"vmul.f32 s14,s16,s24 \n\t" // s14=I*In[0]
"vmul.f32 s15,s17,s24 \n\t" // s15=Q*In[0]
"vstm %[OUT]!,{s14-s15} \n\t" // Store complex downconverted version
"subs r6,#1 \n\t"
"bne loop2 \n\t"
"\nloopexit2: \n\t"
"vstm %[NCO],{s16-s17} \n\t" // Write the advanced NCO phase back for the next call
: [OUT]"+r" (pcsOut), [NUMSAMPLES]"+r" (nNumSamples), [IN]"+r" (pu32In), [OFFSET]"+r" (pstOffset)
: [NCO]"r" (pncos)
: "r5","r6","cc","memory",
"s0","s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9",
"s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19",
"s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29",
"s30","s31"
);
}
You also have to "trim" the NCO to stop it getting out of control over time, but this can be done occasionally and so has negligible performance impact.
// Fix the NCO in case it's running away.
// This is an approximation to G = 1/(SQRT(R^2+I^2)) =~ 0.5*(3-(R^2 + I^2))
// (one Newton-Raphson step for the inverse square root, accurate when the
// phasor magnitude is already close to 1 - which it is between trims).
stG=(SAMPLETYPE)0.5*((SAMPLETYPE)3.0-_ncos.stI*_ncos.stI-_ncos.stQ*_ncos.stQ);
_ncos.stI*=stG; // Scale the phasor back towards unit magnitude
_ncos.stQ*=stG;
The downsampler:
#define NUM_SAMPLES 4096 // samples/block
#define NUM_COEFFS 1024 // number of FIR taps
#define INT_FACTOR 1 // Interpolation factor
#define DEC_FACTOR 128 // Decimation factor
#ifndef PI
#define PI 3.14159265358979
#endif
typedef float32_t SAMPLETYPE; // Sample data type (32 bit float)
typedef float32_t COEFFTYPE;  // Filter coefficient data type
#include "coeffs.h"
// One complex (I/Q) sample.
typedef struct
{
SAMPLETYPE stI; // In-phase (real) component
SAMPLETYPE stQ; // Quadrature (imaginary) component
} COMPLEXSTRUCT;
// Polyphase resampler state.
typedef struct
{
int nDecFactor; // Decimation factor (e.g. DEC_FACTOR)
int nIntFactor; // Interpolation factor (e.g. INT_FACTOR)
COEFFTYPE *pactCoeffsTransposed; // Coefficients arranged per phase (nCoeffsPerPhase each)
COMPLEXSTRUCT *pacsDelayLine; // History of previous-block samples needed by the FIR
COMPLEXSTRUCT *pcsDelayLineEnd; // One past the end of the delay line
int nPaddedCoeffCount; // Padded coefficient count -- set up elsewhere; not used in this chunk
int nCoeffsPerPhase; // Taps per polyphase branch
int nCommutator; // Commutator position (phase selector), 0..nIntFactor-1 between calls
int nXOffset; // Carry-over input offset into the next block
} DOWNSAMPLESTRUCT;
static DOWNSAMPLESTRUCT _dss1; // Down sampler instance
// This is the optimised pinch point of the polyphase filter.
// Accumulates nInCount complex MACs into *pcsOut: acc += in[k]*coeff[k]
// (complex sample, real coefficient), unrolled by 8 with a straggler loop.
// The advanced coefficient pointer is written back through ppctCoeff so the
// caller can continue the same phase across the delay line and the new block.
// FIX(review): added "s2","s3" to the clobber list (both are written by the asm
// but were not declared, so the compiler could keep live values in them), plus
// "memory" (the asm reads *pcsIn / *pctCoeff and reads+writes *pcsOut, which is
// the caller's stack local) and "cc" (subs alters the flags).
__RAMFUNC(RAM2)
static void DownSampleInner(COMPLEXSTRUCT *pcsIn,COMPLEXSTRUCT *pcsOut,COEFFTYPE **ppctCoeff,int nInCount)
{
COEFFTYPE *pctCoeff=*ppctCoeff;
__asm__ __volatile__
(
"\n\t"
"mov r5,%[INCOUNT],ASR #3 \n\t" // R5=number of full 8 unrolled iterations to do
"mov r6,%[INCOUNT] \n\t"
"and r6,r6,#7 \n\t" // R6=remainder, ie any stragglers after unrolled loop
"vldm %[OUT],{s0-s1} \n\t" // Get the complex accumulator so far
"cbnz r5,loopRAI1 \n\t" // Check if enough samples for unroll by 8 version
"b loopexitRAI1 \n\t"
".balign 4 \n\t"
"\nloopRAI1: \n\t"
"vldm %[IN]!,{s16-s31} \n\t" // Load samples
"vldm %[COEFF]!,{s8-s15} \n\t" // Load coefficients
"vmul.f32 s2,s16,s8 \n\t" // s2=acsDelayLine[0].stI * *pctCoeff[0]
"vmul.f32 s3,s17,s8 \n\t" // s3=acsDelayLine[0].stQ * *pctCoeff[0]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s18,s9 \n\t" // s2=acsDelayLine[1].stI * *pctCoeff[1]
"vmul.f32 s3,s19,s9 \n\t" // s3=acsDelayLine[1].stQ * *pctCoeff[1]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s20,s10 \n\t" // s2=acsDelayLine[2].stI * *pctCoeff[2]
"vmul.f32 s3,s21,s10 \n\t" // s3=acsDelayLine[2].stQ * *pctCoeff[2]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s22,s11 \n\t" // s2=acsDelayLine[3].stI * *pctCoeff[3]
"vmul.f32 s3,s23,s11 \n\t" // s3=acsDelayLine[3].stQ * *pctCoeff[3]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s24,s12 \n\t" // s2=acsDelayLine[4].stI * *pctCoeff[4]
"vmul.f32 s3,s25,s12 \n\t" // s3=acsDelayLine[4].stQ * *pctCoeff[4]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s26,s13 \n\t" // s2=acsDelayLine[5].stI * *pctCoeff[5]
"vmul.f32 s3,s27,s13 \n\t" // s3=acsDelayLine[5].stQ * *pctCoeff[5]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s28,s14 \n\t" // s2=acsDelayLine[6].stI * *pctCoeff[6]
"vmul.f32 s3,s29,s14 \n\t" // s3=acsDelayLine[6].stQ * *pctCoeff[6]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"vmul.f32 s2,s30,s15 \n\t" // s2=acsDelayLine[7].stI * *pctCoeff[7]
"vmul.f32 s3,s31,s15 \n\t" // s3=acsDelayLine[7].stQ * *pctCoeff[7]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"subs r5,#1 \n\t"
"bne loopRAI1 \n\t"
"\nloopexitRAI1: \n\t"
"cbz r6,loopexitRAI2 \n\t"
".balign 4 \n\t"
"\nloopRAI2: \n\t" // Do the stragglers
"vldm %[IN]!,{s16-s17} \n\t" // Load sample
"vldm %[COEFF]!,{s8} \n\t" // Load coefficient
"vmul.f32 s2,s16,s8 \n\t" // s2=acsDelayLine[0].stI * *pctCoeff[0]
"vmul.f32 s3,s17,s8 \n\t" // s3=acsDelayLine[0].stQ * *pctCoeff[0]
"vadd.f32 s0,s0,s2 \n\t" // s0+=s2
"vadd.f32 s1,s1,s3 \n\t" // s1+=s3
"subs r6,#1 \n\t"
"bne loopRAI2 \n\t"
"\nloopexitRAI2: \n\t"
"vstm %[OUT],{s0-s1} \n\t" // Write out the complex accumulator
: [IN]"+r" (pcsIn), [COEFF]"+r" (pctCoeff)
: [INCOUNT]"r" (nInCount), [OUT]"r" (pcsOut)
: "r5","r6","cc","memory",
"s0","s1","s2","s3",
"s8","s9","s10","s11","s12","s13","s14","s15",
"s16","s17","s18","s19","s20","s21","s22","s23",
"s24","s25","s26","s27","s28","s29","s30","s31"
);
*ppctCoeff=pctCoeff; // Write back the pointer
}
// This is the polyphase down sampler
// Resamples nInCount complex samples from pcsIn into pcsOut by the ratio
// nIntFactor/nDecFactor, computing each output as one polyphase branch of the
// FIR. History preceding the current block is drawn from the delay line.
// Returns the number of output samples produced.
// NOTE(review): nOutCount is not checked -- the caller must size pcsOut for the
// worst case (NUM_SAMPLES*INT_FACTOR/DEC_FACTOR); confirm at the call sites.
__RAMFUNC(RAM2)
static int DownSample(DOWNSAMPLESTRUCT *pdss,COMPLEXSTRUCT *pcsIn,int nInCount,COMPLEXSTRUCT *pcsOut,int nOutCount)
{
COMPLEXSTRUCT *pcsX=pcsIn+pdss->nXOffset; // First input position to output on (offset carried over from the previous block)
COMPLEXSTRUCT *pcsY=pcsOut; // Output write pointer
COMPLEXSTRUCT *pcsEnd=pcsIn+nInCount; // One past the end of this input block
while (pcsX<pcsEnd)
{
COMPLEXSTRUCT csAcc={(SAMPLETYPE)0,(SAMPLETYPE)0}; // Complex accumulator for this output sample
// Select the coefficient phase for the current commutator position
COEFFTYPE *pctCoeff=pdss->pactCoeffsTransposed+pdss->nCommutator*pdss->nCoeffsPerPhase;
register COMPLEXSTRUCT *pcsX2=pcsX-pdss->nCoeffsPerPhase+1; // Oldest input sample this branch needs
int nOffset=pcsIn-pcsX2; // How many of those samples lie before this block (i.e. in the delay line)
int nAdvanceAmount;
int nCount;
if (nOffset>0)
{
// Need to draw from the delay line
COMPLEXSTRUCT *pcsDelayLine=pdss->pcsDelayLineEnd-nOffset; // The delay line's newest nOffset samples
nCount=pdss->pcsDelayLineEnd-pcsDelayLine;
DownSampleInner(pcsDelayLine,&csAcc,&pctCoeff,nCount); // Accumulate the history part (pctCoeff advances)
pcsX2+=nOffset; // Continue from the start of the current block
}
nCount=pcsX-pcsX2+1; // Remaining taps fall within the current block
DownSampleInner(pcsX2,&csAcc,&pctCoeff,nCount);
*pcsY++=csAcc;
// Advance the commutator: step nDecFactor phases, moving the input pointer
// forward by the whole-phase part and keeping the fractional part for next time
pdss->nCommutator+=pdss->nDecFactor;
nAdvanceAmount=pdss->nCommutator/pdss->nIntFactor;
pcsX+=nAdvanceAmount;
pdss->nCommutator%=pdss->nIntFactor;
}
pdss->nXOffset=pcsX-pcsEnd; // Carry the overshoot (>=0) into the next block
// Manage DelayLine
{
// The next block needs the last (nCoeffsPerPhase-1) samples as history
int nRetain=(pdss->nCoeffsPerPhase-1)-nInCount;
if (nRetain>0)
{
// Block shorter than the history window: keep nRetain old delay-line samples
// and append the whole current block after them
memmove(pdss->pacsDelayLine,pcsEnd-nRetain,sizeof(COMPLEXSTRUCT)*nRetain);
memmove(pdss->pcsDelayLineEnd-nInCount,pcsIn,sizeof(COMPLEXSTRUCT)*(pcsEnd-pcsIn));
}
else
{
// Block covers the whole window: the delay line is just the block's tail
memmove(pdss->pacsDelayLine,pcsEnd-(pdss->nCoeffsPerPhase-1),sizeof(COMPLEXSTRUCT)*(pdss->nCoeffsPerPhase-1));
}
}
return pcsY-pcsOut;
}