QuoteI wish the ISRs in C code [on ARM] that the HW interrupt entry was quicker...Doesn't the 'naked' attribute of the function definition remove the prolog and epilog?
Pretty much everyone using ARC or Xtensa is likely to switch to RISC-V
[complaints about CM0 code]I guess there are two options: 1) let the C compiler figure
You could try the LoFive: https://store.groupgets.com/products/lofive-risc-v
My surprises show up when initializing peripherals. I expected code like:Code: [Select]PORT->Group[0].PINCFG[12].reg |= PORT_PINCFG_DRVSTR;
PORT->Group[0].DIRSET.reg |= 1<<12;
#include <stdint.h>
/* Stand-in for the vendor header's drive-strength mask: bit 7 (0x80). */
#define PORT_PINCFG_DRVSTR (1<<7)
/*
 * Dummy register layout, made up only so the two statements in main()
 * compile; it is not a real device header.
 *
 * Each PINCFG element is three uint32_t (12 bytes), so PINCFG[12].reg sits
 * at byte offset 12*12 + 4 = 148. The 16 PINCFG elements occupy 192 bytes,
 * DIRSET follows, and DIRSET.reg comes after the 8-byte baz at 192 + 8 = 200.
 * These are the #148 (0x94) and #200 (0xc8) offsets used in the ARM
 * disassembly listings that follow.
 */
struct {
struct {
struct {
uint32_t foo;
uint32_t reg;
uint32_t bar;
} PINCFG[16];
struct {
uint64_t baz;
uint32_t reg;
} DIRSET;
} Group[10];
/*
 * PORT is an initialized pointer variable holding an arbitrary address, so
 * the generated code has to load the pointer value from memory first.
 * NOTE(review): the pointee is not volatile-qualified; code for real
 * memory-mapped registers would need volatile here.
 */
} *PORT = (void*)0xdecaf000;
/*
 * Test body whose generated code is being compared: two read-modify-write
 * (|=) updates through the PORT pointer.
 * NOTE(review): `void main()` is not a standard hosted signature; here the
 * program is built with -nostartfiles purely to be disassembled.
 */
void main(){
/* OR the drive-strength mask (0x80) into PINCFG[12].reg of group 0. */
PORT->Group[0].PINCFG[12].reg |= PORT_PINCFG_DRVSTR;
/* OR bit 12 (0x1000) into DIRSET.reg of group 0. */
PORT->Group[0].DIRSET.reg |= 1<<12;
}
arm-linux-gnueabihf-gcc -O initPorts.c -o initPorts -nostartfiles && \
arm-linux-gnueabihf-objdump -D initPorts | expand | less -p'<main>'
000001c0 <main>:
1c0: 4b07 ldr r3, [pc, #28] ; (1e0 <main+0x20>)
1c2: 447b add r3, pc
1c4: 681b ldr r3, [r3, #0]
1c6: f8d3 2094 ldr.w r2, [r3, #148] ; 0x94
1ca: f042 0280 orr.w r2, r2, #128 ; 0x80
1ce: f8c3 2094 str.w r2, [r3, #148] ; 0x94
1d2: f8d3 20c8 ldr.w r2, [r3, #200] ; 0xc8
1d6: f442 5280 orr.w r2, r2, #4096 ; 0x1000
1da: f8c3 20c8 str.w r2, [r3, #200] ; 0xc8
1de: 4770 bx lr
1e0: 00010e3a andeq r0, r1, sl, lsr lr
00011000 <PORT>:
11000: decaf000 cdple 0, 12, cr15, cr10, cr0, {0}
000001c0 <main>:
1c0: e59f3020 ldr r3, [pc, #32] ; 1e8 <main+0x28>
1c4: e08f3003 add r3, pc, r3
1c8: e5933000 ldr r3, [r3]
1cc: e5932094 ldr r2, [r3, #148] ; 0x94
1d0: e3822080 orr r2, r2, #128 ; 0x80
1d4: e5832094 str r2, [r3, #148] ; 0x94
1d8: e59320c8 ldr r2, [r3, #200] ; 0xc8
1dc: e3822a01 orr r2, r2, #4096 ; 0x1000
1e0: e58320c8 str r2, [r3, #200] ; 0xc8
1e4: e12fff1e bx lr
1e8: 00010e34 andeq r0, r1, r4, lsr lr
00011000 <PORT>:
11000: decaf000 cdple 0, 12, cr15, cr10, cr0, {0}
[/code]
Thumb1:
[code]
000001c0 <main>:
1c0: 4b07 ldr r3, [pc, #28] ; (1e0 <main+0x20>)
1c2: 447b add r3, pc
1c4: 681b ldr r3, [r3, #0]
1c6: 2194 movs r1, #148 ; 0x94
1c8: 2280 movs r2, #128 ; 0x80
1ca: 5858 ldr r0, [r3, r1]
1cc: 4302 orrs r2, r0
1ce: 505a str r2, [r3, r1]
1d0: 3134 adds r1, #52 ; 0x34
1d2: 2280 movs r2, #128 ; 0x80
1d4: 0152 lsls r2, r2, #5
1d6: 5858 ldr r0, [r3, r1]
1d8: 4302 orrs r2, r0
1da: 505a str r2, [r3, r1]
1dc: 4770 bx lr
1de: 46c0 nop ; (mov r8, r8)
1e0: 00010e3a andeq r0, r1, sl, lsr lr
00011000 <PORT>:
11000: decaf000 cdple 0, 12, cr15, cr10, cr0, {0}
00000000000002ac <main>:
2ac: b0000080 adrp x0, 11000 <PORT>
2b0: f9400000 ldr x0, [x0]
2b4: b9409401 ldr w1, [x0, #148]
2b8: 32190021 orr w1, w1, #0x80
2bc: b9009401 str w1, [x0, #148]
2c0: b940c801 ldr w1, [x0, #200]
2c4: 32140021 orr w1, w1, #0x1000
2c8: b900c801 str w1, [x0, #200]
2cc: d65f03c0 ret
0000000000011000 <PORT>:
11000: decaf000 .word 0xdecaf000
11004: 00000000 .word 0x00000000
000001c0 <main>:
1c0: 4b07 ldr r3, [pc, #28] ; (1e0 <main+0x20>)
1c2: 447b add r3, pc
1c4: 681b ldr r3, [r3, #0]
1c6: 2194 movs r1, #148 ; 0x94
1c8: 2280 movs r2, #128 ; 0x80
1ca: 5858 ldr r0, [r3, r1]
1cc: 4302 orrs r2, r0
1ce: 505a str r2, [r3, r1]
1d0: 3134 adds r1, #52 ; 0x34
1d2: 2280 movs r2, #128 ; 0x80
1d4: 0152 lsls r2, r2, #5
1d6: 5858 ldr r0, [r3, r1]
1d8: 4302 orrs r2, r0
1da: 505a str r2, [r3, r1]
1dc: 4770 bx lr
1de: 46c0 nop ; (mov r8, r8)
1e0: 00010e3a andeq r0, r1, sl, lsr lr
00011000 <PORT>:
11000: decaf000 cdple 0, 12, cr15, cr10, cr0, {0}
00010074 <main>:
10074: 67c5 lui a5,0x11
10076: 0947a783 lw a5,148(a5) # 11094 <PORT>
1007a: 6685 lui a3,0x1
1007c: 0947a703 lw a4,148(a5)
10080: 08076713 ori a4,a4,128
10084: 08e7aa23 sw a4,148(a5)
10088: 0c87a703 lw a4,200(a5)
1008c: 8f55 or a4,a4,a3
1008e: 0ce7a423 sw a4,200(a5)
10092: 8082 ret
00011094 <PORT>:
11094: f000 fsw fs0,32(s0)
11096: deca sw s2,124(sp)
800001ac <main>:
800001ac: 2079 8000 400c moveal 8000400c <PORT>,%a0
800001b2: 0068 0080 0096 oriw #128,%a0@(150)
800001b8: 0068 1000 00ca oriw #4096,%a0@(202)
800001be: 4e75 rts
8000400c <PORT>:
8000400c: deca addaw %a2,%sp
8000400e: f000
000001b5 <main>:
1b5: e8 20 00 00 00 call 1da <__x86.get_pc_thunk.ax>
1ba: 05 3a 1e 00 00 add $0x1e3a,%eax
1bf: 8b 80 0c 00 00 00 mov 0xc(%eax),%eax
1c5: 81 88 94 00 00 00 80 orl $0x80,0x94(%eax)
1cc: 00 00 00
1cf: 81 88 c8 00 00 00 00 orl $0x1000,0xc8(%eax)
1d6: 10 00 00
1d9: c3 ret
000001da <__x86.get_pc_thunk.ax>:
1da: 8b 04 24 mov (%esp),%eax
1dd: c3 ret
00002000 <PORT>:
2000: 00 f0 add %dh,%al
2002: ca .byte 0xca
2003: de .byte 0xde
004001b0 <main>:
4001b0: 07 d1 mov.l 4001d0 <main+0x20>,r1 ! 411000 <PORT>
4001b2: 12 61 mov.l @r1,r1
4001b4: 13 62 mov r1,r2
4001b6: 7c 72 add #124,r2
4001b8: 26 50 mov.l @(24,r2),r0
4001ba: 80 cb or #-128,r0
4001bc: 06 12 mov.l r0,@(24,r2)
4001be: 05 92 mov.w 4001cc <main+0x1c>,r2 ! bc
4001c0: 2c 31 add r2,r1
4001c2: 13 52 mov.l @(12,r1),r2
4001c4: 03 93 mov.w 4001ce <main+0x1e>,r3 ! 1000
4001c6: 3b 22 or r3,r2
4001c8: 0b 00 rts
4001ca: 23 11 mov.l r2,@(12,r1)
4001cc: bc 00 mov.b @(r0,r11),r0
4001ce: 00 10 mov.l r0,@(0,r0)
4001d0: 00 10 mov.l r0,@(0,r0)
4001d2: 41 00 .word 0x0041
00411000 <PORT>:
411000: 00 f0 .word 0xf000
411002: ca de mov.l 41132c <__bss_start+0x31c>,r14
#Instr | Code | Data | Total | ISA |
10 | 32 | 8 | 40 | Thumb2 |
10 | 40 | 8 | 48 | Arm32 |
15 | 30 | 10 | 40 | Thumb1 |
9 | 36 | 8 | 44 | Arm64 |
10 | 32 | 8 | 40 | RISC-V rv64ic |
10 | 32 | 4 | 36 | RISC-V rv32ic |
10 | 40 | 4 | 44 | RISC-V rv32i |
4 | 20 | 4 | 24 | M68k |
8 | 41 | 4 | 45 | i686 |
14 | 28 | 12 | 40 | SH4 |
My surprises show up when initializing peripherals. I expected code like:Code: [Select]PORT->Group[0].PINCFG[12].reg |= PORT_PINCFG_DRVSTR;
PORT->Group[0].DIRSET.reg |= 1<<12;
Just for fun, I made a couple of definitions so your code would be compilable and tried it on a few things.Code: [Select]#include <stdint.h>
/* Stand-in for the vendor header's drive-strength mask: bit 7 (0x80). */
#define PORT_PINCFG_DRVSTR (1<<7)
/*
 * Dummy register layout, made up only so the two statements in main()
 * compile; it is not a real device header.
 *
 * Each PINCFG element is three uint32_t (12 bytes), so PINCFG[12].reg sits
 * at byte offset 12*12 + 4 = 148. The 16 PINCFG elements occupy 192 bytes,
 * DIRSET follows, and DIRSET.reg comes after the 8-byte baz at 192 + 8 = 200.
 * These are the #148 (0x94) and #200 (0xc8) offsets used in the ARM
 * disassembly listings elsewhere in this thread.
 */
struct {
struct {
struct {
uint32_t foo;
uint32_t reg;
uint32_t bar;
} PINCFG[16];
struct {
uint64_t baz;
uint32_t reg;
} DIRSET;
} Group[10];
/*
 * PORT is an initialized pointer variable holding an arbitrary address, so
 * the generated code has to load the pointer value from memory first.
 * NOTE(review): the pointee is not volatile-qualified; code for real
 * memory-mapped registers would need volatile here.
 */
} *PORT = (void*)0xdecaf000;
/*
 * Test body whose generated code is being compared: two read-modify-write
 * (|=) updates through the PORT pointer.
 * NOTE(review): `void main()` is not a standard hosted signature; the
 * program is only built to be disassembled.
 */
void main(){
/* OR the drive-strength mask (0x80) into PINCFG[12].reg of group 0. */
PORT->Group[0].PINCFG[12].reg |= PORT_PINCFG_DRVSTR;
/* OR bit 12 (0x1000) into DIRSET.reg of group 0. */
PORT->Group[0].DIRSET.reg |= 1<<12;
}
PORT->Group[0].DIRSET.reg = 1<<12; // no need for "|="
6685 lui a3,0x1
0ce7a423 sw a3,200(a5) ; replace "200" with correct offset from a5
bset LATA,#12
Quote: "Pretty much everyone using ARC or Xtensa is likely to switch to RISC-V" Espressif too? Is there any indication that the "mostly China" manufacturers would switch?
Not for ARM Cortex. The NVIC hardware saves exactly the same registers that the C ABI says must be saved, so effectively there is NO extra prolog for ISRs. But the NVIC hardware stacks 8 words of context, so it's slower than it could be if the choice was left to the programmer.
It had 128 32-bit registers, and the convention was that the bottom 64 belonged to user code and the top 64 could be used by the ISR. No saving required.
QuoteQuoteI wish the ISRs in C code [on ARM] that the HW interrupt entry was quicker... Doesn't the 'naked' attribute of the function definition remove the prolog and epilog? Not for ARM Cortex. The NVIC hardware saves exactly the same registers that the C ABI says must be saved, so effectively there is NO extra prolog for ISRs. But the NVIC hardware stacks 8 words of context, so it's slower than it could be if the choice was left to the programmer.
However, I think in the future, as everything moves to multi-cores, things may get even better. If you assign a designated core to an interrupt, then the core can simply sit there waiting for the interrupt to happen. Then there's no latency except for the short period necessary to synchronize the interrupt signal to the CPU clock.
Some modern MCUs have multiple register sets. When an interrupt happens, the new set gets loaded. When it quits, the old one gets restored. It doesn't take any additional time and thus decreases the interrupt latency by a lot. If you have a separate register set for every interrupt level, you never need to save anything.
Register banks do make code that needs to access registers across priority levels a whole lot messier (e.g. task switching using a low-priority interrupt, like is usually done on Cortex-M MCUs, or exception handlers). I guess with modern manufacturing processes the extra state required by the additional register banks isn't a big deal anymore (e.g. 31 32-bit registers by 8 banks is a bit less than 1000 bytes).
However, I think in the future, as everything moves to multi-cores, things may get even better. If you assign a designated core to an interrupt, then the core can simply sit there waiting for the interrupt to happen. Then there's no latency except for the short period necessary to synchronize the interrupt signal to the CPU clock. The limiting factor here will be memory. You either need to have a dedicated memory per core, which will make the maximum size of the handler inflexible, or deal with concurrent access by multiple cores, which will slow down everything.
I have ideas for this too. Most of the cores should have very limited amount of dedicated regular memory, but they will have one or more deep hardware FIFOs. The other end of the FIFOs may be muxed to other cores, which provides wide address-less communication channels between cores. This removes bus congestion altogether. The central core (or cores), in contrast, will have bigger memory so they can process data.
That does not address code memory.
Code memory can be made completely separate from data memory.
... Further, an interrupt only happens when the user code makes a jump... An interesting and useful side-effect is that user code could assume no interrupts while doing code that needs to be atomic.
I just thought some might find this interesting.
[ARM Cortex NVIC register stacking] likely faster in the majority of cases
The register is called DIRSET because writing to it only sets the bits
The compiler may be clever enough to keep one of the registers permanently pointing to the IO registers area
Code memory can be made completely separate from data memory. That's exactly what I'm talking about. You will essentially limit what your "interrupt" handler can do by defining the amount of code memory it has. I think this will be enough of a limitation to make this system impractical. At least for common microcontroller uses.
Quote[ARM Cortex NVIC register stacking] likely faster in the majority of cases
I'm not convinced. We're talking register stacking, probably limited by memory speed, and taking all of 1 instruction (push multiple) in the ISR to save exactly which ones you need...
QuoteThe compiler may be clever enough to keep one of the registers permanently pointing to the IO registers area Maybe. 32-bit processors tend to really spread those IO registers out, perhaps occupying more than even a reasonable offset constant for indexed addressing. And constant-folding upper bits of an address might be too much to ask of a compiler. I remember looking at PIC32 code (MIPS), which loads 32-bit constants half-at-a-time (LUI/ORI), and being disappointed that it kept re-loading the same upper value.
OTOH, I think Microchip was defining those symbols at link time rather than in C source, so there wasn't much choice...
In SAM, "Group" represents a group of registers 128 bytes long and everything below is just unions.
"PORT" would be a fixed location in memory space. So, what the code actually does is setting 2 bits at the fixed memory location.
There's no pointer loading (which takes a whopping 50% in Motorola, and 49% in Intel, which you decided to compile as position-independent code).
Moreover, when someone builds an MCU with RISC-V, they will probably provide some way of setting bits without reading registers, as Atmel did here:Code: [Select]PORT->Group[0].DIRSET.reg = 1<<12; // no need for "|="
The register is called DIRSET because writing to it only sets the bits (and the bits which are written "0" remain unchanged), and there's an opposite register called DIRCLR which clears the bits, and also DIRTGL which xors.
The compiler may be clever enough to keep one of the registers permanently pointing to the IO registers area, so the whole thing boils down to this:Code: [Select]6685 lui a3,0x1
0ce7a423 sw a3,200(a5) ; replace "200" with correct offset from a5
A decade and a half ago, I had the pleasure of working with a VLIW processor, the Trimedia/Philips PNX1302. It dispatched up to 5 operations per instruction word at 200 MHz. It had 128 32-bit registers, and the convention was that the bottom 64 belonged to user code and the top 64 could be used by the ISR. No saving required.
Further, an interrupt only happens when the user code makes a jump. So user code could (with care) use the top 64 between jumps. An interesting and useful side-effect is that user code could assume no interrupts while doing code that needs to be atomic.
It had 128 32-bit registers, and the convention was that the bottom 64 belonged to user code and the top 64 could be used by the ISR. No saving required.
Some modern MCUs have multiple register sets. When an interrupt happens, the new set gets loaded. When it quits, the old one gets restored. It doesn't take any additional time and thus decreases the interrupt latency by a lot. If you have a separate register set for every interrupt level, you never need to save anything.
QuoteQuoteI wish the ISRs in C code [on ARM] that the HW interrupt entry was quicker... Doesn't the 'naked' attribute of the function definition remove the prolog and epilog? Not for ARM Cortex. The NVIC hardware saves exactly the same registers that the C ABI says must be saved, so effectively there is NO extra prolog for ISRs. But the NVIC hardware stacks 8 words of context, so it's slower than it could be if the choice was left to the programmer.
slower in the rare case you need to do something in a few cycles with no registers, likely faster in the majority of cases
Register banks do make code that needs to access registers across priority levels a whole lot messier (e.g. task switching using a low-priority interrupt, like is usually done on Cortex-M MCUs, or exception handlers). I guess with modern manufacturing processes the extra state required by the additional register banks isn't a big deal anymore (e.g. 31 32-bit registers by 8 banks is a bit less than 1000 bytes).
Area is no longer the main cost, but the register bank sits in the critical timing path of the pipeline, so it limits performance in an aggressive design.