Here, I'll compile some things for AVR, using various C++ features, and show you what the overhead is.
First, a quick test to make sure there isn't any overhead in just switching to C++. Here's a test, toggling PA0:
#include <avr/io.h>
/*
 * Baseline test: drive bit 0 of VPORT0 (PA0) high and low in an endless loop.
 * The same source is built once as C and once as C++ so the two program
 * sizes can be compared directly.
 */
int main(void)
{
/* Write 0x01 to the VPORT0 direction register (presumably makes PA0 an output). */
VPORT0.DIR = 0x01;
for (;;) {
/* Read-modify-write: set bit 0 of the output register, then clear it again. */
VPORT0.OUT |= 0x01;
VPORT0.OUT &= ~0x01;
}
}
Same file saved as both test.c and test.cpp.
% avr-gcc -mmcu=atxmega32e5 -g -O1 test.c -o test.o
% avr-size -C --mcu=atxmega32e5 test.o
AVR Memory Usage
----------------
Device: atxmega32e5
Program: 210 bytes (0.6% Full)
(.text + .data + .bootloader)
Data: 0 bytes (0.0% Full)
(.data + .bss + .noinit)
% avr-gcc -mmcu=atxmega32e5 -g -O1 test.cpp -o test.o
% avr-size -C --mcu=atxmega32e5 test.o
AVR Memory Usage
----------------
Device: atxmega32e5
Program: 210 bytes (0.6% Full)
(.text + .data + .bootloader)
Data: 0 bytes (0.0% Full)
(.data + .bss + .noinit)
So no increase there.
Let's try some C++ features, shall we? Using a templated class to wrap a pin:
#include <avr/io.h>
#include <stdint.h>
// Treat an integer address as a pointer to a VPORT_t register block, so the
// address can be carried around as a plain integer (e.g. a template argument).
#define VPORT(addr) ((VPORT_t*)(addr))
// Compile-time wrapper around one pin of a virtual port.
//
// Template parameters:
//   vport - address of the VPORT_t register block, as an integer
//   npin  - bit number of the pin within that port
//           (assumes 0..7; nothing here checks the range)
//
// The class has no data members: both the port address and the pin number
// are template constants.
template<intptr_t vport, uint8_t npin>
class VPortPin
{
public:
// Set this pin's bit in the port's DIR register, leaving the other bits
// untouched (presumably configures the pin as an output).
void init(void) {
VPORT(vport)->DIR |= (1 << npin);
}
// Assigning a bool drives the pin: true sets the pin's bit in OUT, false
// clears it. Returns the value that was assigned (a bool, not a reference
// to *this as a conventional operator= would).
bool operator=(bool rhs) {
if (rhs) {
VPORT(vport)->OUT |= (1 << npin);
} else {
VPORT(vport)->OUT &= ~(1 << npin);
}
return rhs;
}
};
// Template test: same pin toggling as the baseline, but going through the
// VPortPin wrapper (init() plus the overloaded operator=).
int main(void)
{
// The port is named by its raw address, 0x0010 (the address of VPORT0),
// rather than by &VPORT0: unfortunately gcc (or C++ itself) does not accept
// an expression of the form &(*(a)) as a constant expression here.
VPortPin<0x0010, 0> PA0;
PA0.init();
for (;;) {
// Each assignment goes through VPortPin::operator=(bool).
PA0 = true;
PA0 = false;
}
}
% avr-gcc -mmcu=atxmega32e5 -g -O1 test.cpp -o test.o
% avr-size -C --mcu=atxmega32e5 test.o
AVR Memory Usage
----------------
Device: atxmega32e5
Program: 208 bytes (0.6% Full)
(.text + .data + .bootloader)
Data: 0 bytes (0.0% Full)
(.data + .bss + .noinit)
What? It's smaller?? But I thought C++ made everything larger! Let's take a look at the assembly:
% avr-objdump -S test.o
test.o: file format elf32-avr
Disassembly of section .text:
**SNIP VECTOR TABLE
**SNIP CONSTRUCTORS
000000c4 <main>:
template<intptr_t vport, uint8_t npin>
class VPortPin
{
public:
void init(void) {
VPORT(vport)->DIR |= (1 << npin);
c4: 80 9a sbi 0x10, 0 ; 16
}
bool operator=(bool rhs) {
if (rhs) {
VPORT(vport)->OUT |= (1 << npin);
c6: 88 9a sbi 0x11, 0 ; 17
} else {
VPORT(vport)->OUT &= ~(1 << npin);
c8: 88 98 cbi 0x11, 0 ; 17
ca: fd cf rjmp .-6 ; 0xc6 <main+0x2>
Yup. It was smart enough to boil down the templated class, complete with method call and operator overload, to single instructions: sbi (set bit in I/O) to set the pin, cbi (clear bit in I/O) to clear the pin, and rjmp (relative jump) to loop. The class instance takes no SRAM because it has no member variables, and the conditional, the return statement, and the function calls themselves are removed because everything is known at compile time.
Anything else we could test? Ah - how about polymorphism? Let's make a couple of subclasses of an abstract parent class (one with a pure virtual method), and see how it deals with that.
#include <avr/io.h>
#include <stdint.h>
// Abstract base class for the polymorphism test: declares a single pure
// virtual method that each subclass must implement.
class A
{
public:
// Pure virtual; calls made through an A* are dispatched to the subclass.
virtual void do_thing() = 0;
};
// First concrete subclass of A.
class B: public A
{
// Overrides A::do_thing. It is private here (default class access), but it
// is still reachable through the public A::do_thing via an A pointer.
// Writes 0x01 to PORTA.OUTTGL (presumably toggles pin 0 of port A).
virtual void do_thing() {
PORTA.OUTTGL = 0x01;
}
};
// Second concrete subclass of A.
class C: public A
{
// Overrides A::do_thing. It is private here (default class access), but it
// is still reachable through the public A::do_thing via an A pointer.
// Writes 0x80 to PORTA.OUTTGL (presumably toggles pin 7 of port A).
virtual void do_thing() {
PORTA.OUTTGL = 0x80;
}
};
int main(void)
{
B b;
C c;
A *pa_b = &b;
A *pa_c = &c;
PORTA.DIR = 0x81;
for (;;) {
pa_b->do_thing();
pa_c->do_thing();
}
}
% avr-gcc -mmcu=atxmega32e5 -g -O1 test.cpp -o test.o
% avr-size -C --mcu=atxmega32e5 test.o
AVR Memory Usage
----------------
Device: atxmega32e5
Program: 322 bytes (0.9% Full)
(.text + .data + .bootloader)
Data: 12 bytes (0.3% Full)
(.data + .bss + .noinit)
% avr-objdump -S test.o
test.o: file format elf32-avr
Disassembly of section .text:
**SNIP VECTOR TABLE
**SNIP CONSTRUCTORS
000000da <main>:
PORTA.OUTTGL = 0x80;
}
};
int main(void)
{
da: cf 93 push r28
dc: df 93 push r29
de: 00 d0 rcall .+0 ; 0xe0 <main+0x6>
e0: 00 d0 rcall .+0 ; 0xe2 <main+0x8>
e2: cd b7 in r28, 0x3d ; 61
e4: de b7 in r29, 0x3e ; 62
{
public:
virtual void do_thing() = 0;
};
class B: public A
e6: 84 e0 ldi r24, 0x04 ; 4
e8: 90 e2 ldi r25, 0x20 ; 32
ea: 8b 83 std Y+3, r24 ; 0x03
ec: 9c 83 std Y+4, r25 ; 0x04
virtual void do_thing() {
PORTA.OUTTGL = 0x01;
}
};
class C: public A
ee: 8a e0 ldi r24, 0x0A ; 10
f0: 90 e2 ldi r25, 0x20 ; 32
f2: 89 83 std Y+1, r24 ; 0x01
f4: 9a 83 std Y+2, r25 ; 0x02
B b;
C c;
A * const pa_b = &b;
A * const pa_c = &c;
PORTA.DIR = 0x81;
f6: 81 e8 ldi r24, 0x81 ; 129
f8: 80 93 00 06 sts 0x0600, r24
for (;;) {
pa_b->do_thing();
fc: eb 81 ldd r30, Y+3 ; 0x03
fe: fc 81 ldd r31, Y+4 ; 0x04
100: 01 90 ld r0, Z+
102: f0 81 ld r31, Z
104: e0 2d mov r30, r0
106: ce 01 movw r24, r28
108: 03 96 adiw r24, 0x03 ; 3
10a: 09 95 icall
pa_c->do_thing();
10c: e9 81 ldd r30, Y+1 ; 0x01
10e: fa 81 ldd r31, Y+2 ; 0x02
110: 01 90 ld r0, Z+
112: f0 81 ld r31, Z
114: e0 2d mov r30, r0
116: ce 01 movw r24, r28
118: 01 96 adiw r24, 0x01 ; 1
11a: 09 95 icall
11c: ef cf rjmp .-34 ; 0xfc <main+0x22>
Two or three instructions' worth of initialization that I think I could probably avoid. Two or three instructions isn't much. As for the indirect, polymorphic method calls, each takes eight instructions: two to load the pointer, two to presumably get the method pointer from the vtable, three to prepare registers for the call, and the call itself. (Note that I used PORT_t.OUTTGL rather than VPORT_t.OUT here; PORT_t is in the high section of memory, so accessing it requires multiple instructions and spoils registers.) I'm unconvinced I could have done better.