Hi,
I am developing an FM RDS encoder based on an ATmega8 (RDS stream generator) and an ATtiny2313 (RDS biphase modulator).
To generate the biphase signal modulated on 57 kHz, I have stored a lookup table in flash which holds the precalculated waveform; it is output through an 8-bit R-2R D/A converter connected to PORTB.
I have set up a timer which fires an interrupt at an interval of 8.77 µs. The interrupt handler takes a value from the LUT and outputs it on PORTB:
/*
 * Precalculated output waveform: 48 samples covering one full period
 * (rising from mid-scale to 0xFF, falling to 0x01, and back to mid-scale).
 * Stored in program memory, so it must be read with pgm_read_byte().
 *
 * The initializer is a flat list: ubTable is a one-dimensional array, so an
 * extra inner brace would be treated as the initializer of ubTable[0] alone.
 */
const uint8_t ubTable[48] PROGMEM = {
    /* 1-1-1 */
    0x82, 0x91, 0xA1, 0xB1, 0xC0, 0xCE, 0xDA, 0xE5,
    0xEE, 0xF6, 0xFB, 0xFE, 0xFF, 0xFE, 0xFB, 0xF5,
    0xEE, 0xE5, 0xDA, 0xCD, 0xBF, 0xB0, 0xA1, 0x90,
    0x80, 0x6F, 0x5F, 0x4F, 0x40, 0x32, 0x26, 0x1B,
    0x12, 0x0A, 0x05, 0x02, 0x01, 0x02, 0x05, 0x0B,
    0x12, 0x1C, 0x27, 0x33, 0x41, 0x51, 0x61, 0x72
};
/* Index into ubTable of the next sample to output. */
volatile uint8_t ubWavePosition = 0;

/*
 * Timer1 overflow interrupt: write the current waveform sample to PORTB
 * and advance to the next one, wrapping after the last table entry.
 */
ISR (TIMER1_OVF_vect)
{
    /*
     * Work on a local copy. ubWavePosition is volatile, so every direct use
     * is a separate memory access; one load and one store per interrupt is
     * all that is needed.
     */
    uint8_t ubPos = ubWavePosition;

    PORTB = pgm_read_byte(&ubTable[ubPos]);

    /* ">=" rather than "==" so an out-of-range index also wraps to zero. */
    if (++ubPos >= sizeof ubTable / sizeof ubTable[0]) {
        ubPos = 0;
    }
    ubWavePosition = ubPos;
}
Reading a value from flash like this is not fast enough, and using inline assembly might improve the speed. Unfortunately, I'm not familiar with AVR assembly.
Can someone point me in the right direction for using inline assembly to output the waveform?
Thank you.
At 20 MHz, each cycle takes 0.05µs, so 8.77µs is about 175 cycles. On ATtiny2313, the PORTB update including the lookup takes something like 15 to 20 cycles. Interrupt overhead is about 20-30 cycles in practice (depending on how many registers need to be pushed to the stack), so using an ISR might or might not work.
For reference, the following code:
#define __AVR_ATtiny2313__
#include <avr/io.h>
#include <avr/interrupt.h>
#include <avr/pgmspace.h>
/*
 * Output waveform, 48 samples (one full period).
 *
 * PROGMEM (from <avr/pgmspace.h>) places the table in program memory. It is
 * required here because the table is read with pgm_read_byte(), which reads
 * flash; without it the array would live in RAM and the reads would return
 * unrelated flash contents.
 */
static const unsigned char table[48] PROGMEM = {
0x82, 0x91, 0xA1, 0xB1, 0xC0, 0xCE, 0xDA, 0xE5,
0xEE, 0xF6, 0xFB, 0xFE, 0xFF, 0xFE, 0xFB, 0xF5,
0xEE, 0xE5, 0xDA, 0xCD, 0xBF, 0xB0, 0xA1, 0x90,
0x80, 0x6F, 0x5F, 0x4F, 0x40, 0x32, 0x26, 0x1B,
0x12, 0x0A, 0x05, 0x02, 0x01, 0x02, 0x05, 0x0B,
0x12, 0x1C, 0x27, 0x33, 0x41, 0x51, 0x61, 0x72
};
/* Index into table of the next sample to output; used by the ISR below. */
static unsigned char timer1_index = 0;
/*
 * Timer1 overflow interrupt: output one table sample on PORTB, then advance
 * the index, wrapping to zero after the last entry.
 */
ISR (TIMER1_OVF_vect)
{
PORTB = pgm_read_byte(table + timer1_index); /* read sample from program memory, write it to the port */
timer1_index++;
if (timer1_index >= sizeof table / sizeof table[0]) /* past the last of the 48 entries? */
timer1_index = 0; /* start over at the first sample */
}
using avr-gcc-4.9.2 -mmcu=avr25 -O2 compiles the ISR into
; Compiler output for ISR(TIMER1_OVF_vect): save the registers it uses,
; output table[timer1_index] on PORTB, advance/wrap timer1_index, restore
; the registers and return. The epilogue is emitted twice, once per branch.
__vector_5:
push r1 ; Interrupt prologue
push r0 ;
in r0, __SREG__ ; Read the status register ...
push r0 ; ... and save it on the stack
clr __zero_reg__ ;
push r24 ; Save the working registers used below
push r30 ;
push r31 ;
lds r30, timer1_index ; Load table+timer1_index
ldi r31, 0 ; into Z (r31*256+r30)
subi r30, lo8(-(table)) ; (adding table is done by subtracting -table)
sbci r31, hi8(-(table)) ;
lpm r30, Z ; Load flash *Z into r30
out 0x18, r30 ; PORTB = r30
lds r24, timer1_index ; Update timer1_index
subi r24, lo8(-(1)) ; r24 = r24 + 1
cpi r24, lo8(48) ; Compare with the table length
brlo .L10 ; If not over, jump to .L10
sts timer1_index,__zero_reg__ ; Clear timer1_index to zero
pop r31 ; Interrupt epilogue
pop r30 ;
pop r24 ;
pop r0 ; Saved status register value ...
out __SREG__,r0 ; ... restored
pop r0 ;
pop r1 ;
reti ; Return from interrupt
.L10:
sts timer1_index, r24 ; Save updated timer1_index.
pop r31 ; Interrupt epilogue
pop r30 ;
pop r24 ;
pop r0 ; Saved status register value ...
out __SREG__,r0 ; ... restored
pop r0 ;
pop r1 ;
reti ; Return from interrupt
You do not need to know assembly to see that there isn't much there you can avoid. On ATtiny2313, LPM takes 3 cycles, so moving the array to RAM would save you just 1 cycle per sample.
So, if you are having difficulty with that code not being fast enough, it is not because there is something wrong with the C code; it is just that there is too much overhead in using an interrupt in the first place.
Let's look at the same code, but written as a loop, with an 8 µs delay:
#define __AVR_ATtiny2313__
#define F_CPU 20000000
#include <avr/io.h>
#include <avr/pgmspace.h>
#include <util/delay.h>
/*
 * Output waveform, 48 samples (one full period).
 *
 * PROGMEM (from <avr/pgmspace.h>) places the table in program memory. It is
 * required here because the table is read with pgm_read_byte(), which reads
 * flash; without it the array would live in RAM and the reads would return
 * unrelated flash contents.
 */
static const unsigned char table[48] PROGMEM = {
0x82, 0x91, 0xA1, 0xB1, 0xC0, 0xCE, 0xDA, 0xE5,
0xEE, 0xF6, 0xFB, 0xFE, 0xFF, 0xFE, 0xFB, 0xF5,
0xEE, 0xE5, 0xDA, 0xCD, 0xBF, 0xB0, 0xA1, 0x90,
0x80, 0x6F, 0x5F, 0x4F, 0x40, 0x32, 0x26, 0x1B,
0x12, 0x0A, 0x05, 0x02, 0x01, 0x02, 0x05, 0x0B,
0x12, 0x1C, 0x27, 0x33, 0x41, 0x51, 0x61, 0x72
};
/*
 * Output the table on PORTB forever, one sample per pass, with a fixed
 * busy-wait between samples. Never returns.
 *
 * The time between samples is the 8 us delay plus the time taken by the
 * other statements in the loop body.
 */
void loop(void)
{
unsigned char index = 0; /* position in table of the next sample */
while (1) {
PORTB = pgm_read_byte(table + index); /* read sample from program memory, write it to the port */
if (++index >= sizeof table / sizeof table[0]) /* advanced past the last entry? */
index = 0; /* wrap to the first sample */
_delay_us(8); /* busy-wait; the cycle count is derived from F_CPU */
}
}
which generates
; Compiler output for loop(): the index is kept in r24 for the whole loop,
; so there is no register saving and no memory traffic for the index.
loop:
ldi r24, 0 ; r24 = index
.L3:
mov r30, r24 ; r31*256+r30 = Z = table+index
ldi r31, 0
subi r30, lo8(-(table)) ; (adding table is done by subtracting -table)
sbci r31, hi8(-(table))
lpm r30, Z ; r30 = *Z from progmem
out 0x18, r30 ; PORTB = r30
subi r24, lo8(-(1)) ; index++
cpi r24, lo8(48) ; if index is too high,
brlo .L2 ; jump to .L2
ldi r24, 0 ; otherwise clear index
.L2:
ldi r25, lo8(53) ; _delay_us(8);
1: ;   ldi + 53 x (dec, brne) + nop
dec r25 ;   = 1 + (52*3 + 2) + 1 = 160 cycles
brne 1b ;   = 8 us at F_CPU = 20 MHz
nop ;
rjmp .L3 ; next sample
i.e., the _delay_us(8) simplifies to a loop that simply decrements a value (from 53 to 0), followed by a single no-operation.
If you have interrupts enabled when this code is running, you'll see occasional glitches in the output (if using e.g. an oscilloscope). This is because interrupts will interrupt the loop, and do other stuff while samples should be output.