I have switched to BAM (Binary Angle Modulation), also known as BCM (Binary Code Modulation). I am now using the MIBAM technique (mirrored BAM, which eliminates flicker on some transitions). My old code simply cannot compete with it.
My MIBAM variant is much smaller in terms of code memory and CPU time. It is far more stable and has no restrictions on minimal brightness or on the brightness of all channels.
Now I'm using 10-bit MIBAM with some optimizations and it works very well.

It uses only one interrupt (16-bit Timer1 compare match B), a simple interrupt routine, and double-buffered values.
Here is the code:
// Maxim Krukov aka Fagear (2013)
// Code used on ATmega168 (ATmega88/ATmega328)
// LED output pins (all on one port).
#define IO_LED_PORT PORTD // Output port for LEDs.
#define IO_LED_DIR DDRD // Data direction register of the LED port.
#define IO_LED1 (1<<7) // Channel G.
#define IO_LED2 (1<<6) // Channel B.
#define IO_LED3 (1<<5) // Channel R.
// Bits of uc_bam_data_flags.
#define BAM_DATA_VALID (1<<0) // BAM values updated successfully.
// BAM stage numbering: stages run 0...BAM_MAX_STAGE.
#define BAM_MAX_STAGE 16 // BAM last stage number.
// Parenthesized so the macro stays correct inside larger expressions
// (e.g. 2*BAM_STAGE_COUNT must be 34, not 2*16+1).
#define BAM_STAGE_COUNT (BAM_MAX_STAGE+1) // BAM stage count.
// Indexes into the input color array.
#define COLOR_R 0
#define COLOR_G 1
#define COLOR_B 2
// Current BAM stage (index into tbl_bam_bitmask[] and tbl_bam_len[]),
// bound to CPU register r2. A global register variable cannot carry an
// initializer, so its start-up value is whatever r2 happens to hold.
register volatile unsigned char uc_bam_stage asm("r2");
// Status flags exchanged between BAM_data_convert() and the ISR
// (see BAM_DATA_VALID).
volatile unsigned char uc_bam_data_flags=0;
// Values the ISR is currently outputting; reloaded from the *_in buffers
// when the stage counter wraps and BAM_DATA_VALID is set.
volatile unsigned int
ui_bam1_comp=0, // BAM channel 1 value for BAM processor (interrupts).
ui_bam2_comp=0, // BAM channel 2 value for BAM processor (interrupts).
ui_bam3_comp=0; // BAM channel 3 value for BAM processor (interrupts).
// Mask of the bit being output in the current stage (written by the ISR).
unsigned int ui_bam_bit=0;
// Buffered input values. They are written by BAM_data_convert() and read by
// the ISR, guarded by the BAM_DATA_VALID flag. They are volatile so that the
// compiler keeps these stores ordered between the "clear flag" and "set flag"
// accesses; without it the stores may legally be moved past the flag update.
volatile unsigned int
ui_bam1_in=0, // BAM channel 1 buffered value from BAM_data_convert();
ui_bam2_in=0, // BAM channel 2 buffered value from BAM_data_convert();
ui_bam3_in=0; // BAM channel 3 buffered value from BAM_data_convert();
// Lookup table for BAM stage -> bit number of the channel value shown in
// that stage (the ISR converts it to a mask with 1<<bit).
const unsigned char tbl_bam_bitmask[BAM_STAGE_COUNT] =
{
    9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // Stages 0...9: bits 9 down to 0.
    3, 4, 5, 6, 7, 8, 9           // Stages 10...16: bits 3 up to 9 again.
};
// Lookup table for BAM stage -> timer count loaded into OCR1B for that
// stage, i.e. the count at which the next stage starts.
const unsigned int tbl_bam_len[BAM_STAGE_COUNT] =
{
    5120, 7680, 8960, 9600, 9920, 10080, 10160, 10240, 10280, 10300,
    10380, 10540, 10860, 11500, 12780, 15340, 20460
};
// Lookup table for BYTE -> BAM (brightness correction).
// Maps an 8-bit input brightness (index 0...255) to a 10-bit BAM value
// (0...1023). Stored in program memory (PROGMEM), so it must be read with
// pgm_read_word_near() as BAM_data_convert() does.
// NOTE(review): the values look like a gamma-style curve; the exact formula
// is not shown here.
const unsigned int tbl_byte_to_bam[256] PROGMEM =
{
0, 0, 0, 1, 1, 1, 1, 1,
1, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 4, 4, 4, 4, 5,
5, 5, 6, 6, 6, 7, 7, 8,
8, 9, 9, 10, 10, 11, 11, 12,
12, 13, 14, 14, 15, 16, 16, 17,
18, 19, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
33, 34, 35, 36, 38, 39, 40, 41,
43, 44, 46, 47, 48, 50, 51, 53,
55, 56, 58, 59, 61, 63, 65, 66,
68, 70, 72, 74, 76, 78, 80, 82,
84, 86, 88, 90, 92, 95, 97, 99,
102, 104, 107, 109, 112, 114, 117, 119,
122, 125, 127, 130, 133, 136, 139, 142,
145, 148, 151, 154, 157, 160, 163, 167,
170, 173, 177, 180, 184, 187, 191, 194,
198, 202, 205, 209, 213, 217, 221, 225,
229, 233, 237, 241, 245, 250, 254, 258,
263, 267, 272, 276, 281, 286, 290, 295,
300, 305, 310, 315, 320, 325, 330, 335,
340, 345, 351, 356, 361, 367, 373, 378,
384, 389, 395, 401, 407, 413, 419, 425,
431, 437, 443, 450, 456, 462, 469, 475,
482, 488, 495, 502, 509, 516, 522, 529,
536, 544, 551, 558, 565, 573, 580, 587,
595, 603, 610, 618, 626, 634, 642, 650,
658, 666, 674, 682, 690, 699, 707, 716,
724, 733, 742, 750, 759, 768, 777, 786,
795, 805, 814, 823, 833, 842, 852, 861,
871, 881, 890, 900, 910, 920, 930, 941,
951, 961, 972, 982, 993, 1003, 1014, 1023
};
// Timer1 Compare Match B Handler.
// Software 3-channel BAM processing.
// Each invocation handles one BAM stage: it drives the three LED pins
// according to the stage's bit of each channel value, programs OCR1B with
// the compare value for this stage, and advances the stage counter. When the
// counter wraps, new channel values are taken from the ui_bamN_in buffers if
// BAM_DATA_VALID is set.
ISR(TIMER1_COMPB_vect)
{
    // uc_bam_stage is a global register variable and cannot have an
    // initializer, so it may hold an arbitrary value on the first interrupt.
    // An out-of-range stage would index past both lookup tables and could
    // load OCR1B with a value above TOP (OCR1A), after which compare match B
    // would never occur again. Force it back into range first.
    if(uc_bam_stage>BAM_MAX_STAGE)
    {
        uc_bam_stage=0;
    }
    // Mask for the bit displayed in this stage (unsigned shift).
    ui_bam_bit=(1U<<tbl_bam_bitmask[uc_bam_stage]);
    // Channel 1.
    if((ui_bam1_comp&ui_bam_bit)!=0)
    {
        IO_LED_PORT|=IO_LED1;
    }
    else
    {
        IO_LED_PORT&=~IO_LED1;
    }
    // Channel 2.
    if((ui_bam2_comp&ui_bam_bit)!=0)
    {
        IO_LED_PORT|=IO_LED2;
    }
    else
    {
        IO_LED_PORT&=~IO_LED2;
    }
    // Channel 3.
    if((ui_bam3_comp&ui_bam_bit)!=0)
    {
        IO_LED_PORT|=IO_LED3;
    }
    else
    {
        IO_LED_PORT&=~IO_LED3;
    }
    // Set next time trap.
    OCR1B=tbl_bam_len[uc_bam_stage];
    // Go to next stage.
    uc_bam_stage++;
    if(uc_bam_stage>BAM_MAX_STAGE)
    {
        uc_bam_stage=0;
        // Check data validity.
        if((uc_bam_data_flags&BAM_DATA_VALID)!=0)
        {
            // Pre-load new color values.
            ui_bam1_comp=ui_bam1_in;
            ui_bam2_comp=ui_bam2_in;
            ui_bam3_comp=ui_bam3_in;
        }
    }
}
//-------------------------------------- Startup initialization.
// Configures the LED pins as outputs and starts Timer1 for BAM timing.
inline void reset_init(void)
{
    // Compare value of the last BAM stage (largest entry of tbl_bam_len[]).
    const unsigned int last_stage_len = tbl_bam_len[BAM_MAX_STAGE];

    // Hardware init.
    // LED pins become outputs.
    IO_LED_DIR |= (IO_LED1 | IO_LED2 | IO_LED3);
    // Start Timer1 (10-bit MIBAM timing): clk/8, clear on compare match (TOP=OCR1A).
    TCCR1B = (1<<WGM12) | (1<<CS11);
    // TOP is one count above the largest time offset.
    OCR1A = last_stage_len + 1;
    // Default OCR1B: the first compare match B happens at the last offset.
    OCR1B = last_stage_len;
    // Enable Timer1 compare match B interrupt.
    TIMSK1 = (1<<OCIE1B);
}
//-------------------------------------- Convert values for BAM.
// Translates the three 8-bit brightness inputs in uca_transit[] into 10-bit
// BAM values via tbl_byte_to_bam[] and stores them in the ui_bamN_in buffers.
// BAM_DATA_VALID is cleared while the buffers are being rewritten and set
// again afterwards; the ISR only copies the buffers while the flag is set.
// NOTE(review): channel 1 is fed from COLOR_R here, while IO_LED1 is
// commented as "Channel G" - confirm which mapping matches the wiring.
void BAM_data_convert(void)
{
    // Mark buffered data as not valid during the update.
    uc_bam_data_flags = uc_bam_data_flags & ~BAM_DATA_VALID;
    // Look up 10-bit values (0...1023) for the 8-bit inputs.
    ui_bam1_in = pgm_read_word_near(&tbl_byte_to_bam[uca_transit[COLOR_R]]);
    ui_bam2_in = pgm_read_word_near(&tbl_byte_to_bam[uca_transit[COLOR_G]]);
    ui_bam3_in = pgm_read_word_near(&tbl_byte_to_bam[uca_transit[COLOR_B]]);
    // Buffers are complete: mark them valid again.
    uc_bam_data_flags = uc_bam_data_flags | BAM_DATA_VALID;
}
Input brightness values:
uca_transit[COLOR_R],
uca_transit[COLOR_G],
uca_transit[COLOR_B] - unsigned char (0...255).
BAM_data_convert() must be called after each update of
uca_transit, or simply called repeatedly from the main loop in
main().
This algorithm is universal and can easily be converted to an 8-bit, 9-bit, 10-bit or any other variant of BAM or MIBAM by changing the define
BAM_MAX_STAGE and the arrays
tbl_bam_bitmask[] and
tbl_bam_len[]. Refresh rate and accuracy are also controlled by these values and by the MCU clock.
I have an excel file with some scripts to recalc