The ATSAMC21 has PORTx.OUT, PORTx.OUTSET, PORTx.OUTCLR, and PORTx.OUTTGL registers to set the output pin states, and PORTx.DIR, PORTx.DIRSET, PORTx.DIRCLR, and PORTx.DIRTGL registers to set the pin direction. You need bit operations only when reading pin states. The SAMC21 is a Cortex-M0+ with a single-cycle multiplier, so we can do something pretty crafty:
#include <stdint.h>
/* TODO: Verify these addresses! */
/*
 * Memory-mapped PORT registers, each accessed as a volatile 32-bit word.
 * Layout used below, relative to each port's base:
 *   DIRCLR +0x04, DIRSET +0x08, DIRTGL +0x0C,
 *   OUTCLR +0x14, OUTSET +0x18, OUTTGL +0x1C, IN +0x20.
 * NOTE(review): the 0x60000000 base is presumably the IOBUS alias of the
 * PORT peripheral - confirm against the SAMC21 datasheet.
 */
/* Port A, base 0x60000000. */
#define PORTA_DIRCLR (*(volatile uint32_t *)0x60000004)
#define PORTA_DIRSET (*(volatile uint32_t *)0x60000008)
#define PORTA_DIRTGL (*(volatile uint32_t *)0x6000000C)
#define PORTA_OUTCLR (*(volatile uint32_t *)0x60000014)
#define PORTA_OUTSET (*(volatile uint32_t *)0x60000018)
#define PORTA_OUTTGL (*(volatile uint32_t *)0x6000001C)
#define PORTA_IN (*(volatile uint32_t *)0x60000020)
/* Port B, base 0x60000080: same register layout, 0x80 above Port A. */
#define PORTB_DIRCLR (*(volatile uint32_t *)0x60000084)
#define PORTB_DIRSET (*(volatile uint32_t *)0x60000088)
#define PORTB_DIRTGL (*(volatile uint32_t *)0x6000008C)
#define PORTB_OUTCLR (*(volatile uint32_t *)0x60000094)
#define PORTB_OUTSET (*(volatile uint32_t *)0x60000098)
#define PORTB_OUTTGL (*(volatile uint32_t *)0x6000009C)
#define PORTB_IN (*(volatile uint32_t *)0x600000A0)
/* Pin configuration, 640 bytes. */
/*
 * Two lookup tables indexed [logical pin][bank], where bank 0 is Port A and
 * bank 1 is Port B. Each table is 40 * 2 * 4 = 320 bytes. Entries are filled
 * in by pin_define().
 */
/* Single-bit mask for the DIRxxx / OUTxxx registers; zero in the bank the pin is not on. */
uint32_t pin_mask[40][2];
/* Power-of-two multiplier that moves the pin's IN bit up to bit 31; zero in the bank the pin is not on. */
uint32_t pin_mult[40][2];
/*
 * pin_define() - bind logical pin 'pin' to physical GPIO 'num' by filling in
 * its pin_mask[] and pin_mult[] rows.
 *
 * pin: logical pin index, 0..39.
 * num: physical pin number, encoded as listed below (0..63).
 * Returns 0 on success, or -1 if either argument is out of range, in which
 * case neither table is modified.
 */
/* PA00 = 0, PA31 = 31, PB00 = 32, PB31 = 63. */
int pin_define(const int pin, const int num)
{
/* Safety check */
if (pin < 0 || pin >= 40 || num < 0 || num >= 64)
return -1;
if (num < 32) {
/* Port A pin: the mask selects bit 'num'; the multiplier 1 << (31 - num)
   shifts that same bit up to bit 31. The Port B entries are zero. */
pin_mask[pin][0] = ((uint32_t)1) << num;
pin_mask[pin][1] = 0;
pin_mult[pin][0] = ((uint32_t)1) << (31 - num);
pin_mult[pin][1] = 0;
} else {
/* Port B pin: the bit index within the port is num - 32, so the multiplier
   is 1 << (31 - (num - 32)) = 1 << (63 - num). The Port A entries are zero. */
pin_mask[pin][0] = 0;
pin_mask[pin][1] = ((uint32_t)1) << (num - 32);
pin_mult[pin][0] = 0;
pin_mult[pin][1] = ((uint32_t)1) << (63 - num);
}
return 0;
}
/* Write the logical pin's per-bank masks to PORTA_DIRCLR and PORTB_DIRCLR. 'pin' is used as a table index without a range check. */
void pin_mode_in(const int pin) { PORTA_DIRCLR = pin_mask[pin][0]; PORTB_DIRCLR = pin_mask[pin][1]; }
/* Write the logical pin's per-bank masks to PORTA_DIRSET and PORTB_DIRSET. 'pin' is used as a table index without a range check. */
void pin_mode_out(const int pin) { PORTA_DIRSET = pin_mask[pin][0]; PORTB_DIRSET = pin_mask[pin][1]; }
/* Write the logical pin's per-bank masks to PORTA_DIRTGL and PORTB_DIRTGL. 'pin' is used as a table index without a range check. */
void pin_mode_tgl(const int pin) { PORTA_DIRTGL = pin_mask[pin][0]; PORTB_DIRTGL = pin_mask[pin][1]; }
/*
 * pin_mode() - select the direction of logical pin 'pin' from bit 0 of 'mode':
 * bit 0 set calls pin_mode_out(), bit 0 clear calls pin_mode_in().
 * All other bits of 'mode' are currently ignored.
 */
void pin_mode(const int pin, const int mode)
{
if (mode & 1) {
pin_mode_out(pin);
} else {
pin_mode_in(pin);
/* TODO: pullups etc., per additional mode bits */
}
}
/* Write the logical pin's per-bank masks to PORTA_OUTSET and PORTB_OUTSET. 'pin' is used as a table index without a range check. */
void pin_out_set(const int pin) { PORTA_OUTSET = pin_mask[pin][0]; PORTB_OUTSET = pin_mask[pin][1]; }
/* Write the logical pin's per-bank masks to PORTA_OUTCLR and PORTB_OUTCLR. 'pin' is used as a table index without a range check. */
void pin_out_clr(const int pin) { PORTA_OUTCLR = pin_mask[pin][0]; PORTB_OUTCLR = pin_mask[pin][1]; }
/* Write the logical pin's per-bank masks to PORTA_OUTTGL and PORTB_OUTTGL. 'pin' is used as a table index without a range check. */
void pin_out_tgl(const int pin) { PORTA_OUTTGL = pin_mask[pin][0]; PORTB_OUTTGL = pin_mask[pin][1]; }
/*
 * pin_out() - drive logical pin 'pin' from 'state': any nonzero value calls
 * pin_out_set(), zero calls pin_out_clr().
 */
void pin_out(const int pin, const int state)
{
if (state) {
pin_out_set(pin);
} else {
pin_out_clr(pin);
}
}
/*
 * pin_in() - read logical pin 'pin'; returns 1 or 0.
 *
 * Both PORTA_IN and PORTB_IN are read on every call. Each value is multiplied
 * by the pin's per-bank multiplier from pin_mult[]: the nonzero multiplier is a
 * power of two that moves the pin's bit to bit 31 (truncated to 32 bits), and
 * the zero multiplier for the other bank makes that term vanish. Bit 31 of the
 * OR of the two products is then reduced to 0 or 1 with !!.
 * 'pin' is used as a table index without a range check.
 */
uint32_t pin_in(const int pin)
{
return !!(((PORTA_IN * pin_mult[pin][0]) | (PORTB_IN * pin_mult[pin][1])) & 0x80000000);
}
which, using arm-gcc 5.4.1 (-Wall -Os -mcpu=cortex-m0plus -mthumb), generates
.syntax unified
.cpu cortex-m0plus
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 4
.eabi_attribute 34, 0
.eabi_attribute 18, 4
.thumb
.syntax unified
.file "ops.c"
.text
.align 1
.global pin_define
.code 16
.thumb_func
.type pin_define, %function
@ pin_define: fills in the pin_mask and pin_mult rows for one logical pin.
@ In:  r0 = pin, r1 = num.
@ Out: r0 = 0 on success, -1 if pin > 39 or num > 63 (unsigned compares).
@ r4-r7 and lr are pushed on entry and popped on exit.
pin_define:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, r5, r6, r7, lr}
@ Range checks. bhi is an unsigned comparison, so negative values fail the same test.
cmp r0, #39
bhi .L5
cmp r1, #63
bhi .L5
@ r2 = pin_mask, r5 = pin_mult, r3 = pin * 8 (byte offset of the two-word row).
ldr r2, .L7
lsls r3, r0, #3
ldr r5, .L7+4
@ num > 31 takes the .L3 path, which fills the second column of each row.
cmp r1, #31
bgt .L3
@ num <= 31: pin_mask[pin] = { 1 << num, 0 }.
movs r4, #1
movs r0, r4
lsls r0, r0, r1
str r0, [r2, r3]
movs r0, #0
adds r2, r2, r3
str r0, [r2, #4]
@ pin_mult[pin] = { 1 << (31 - num), 0 }. r0 is still 0 and becomes the return value.
movs r2, #31
subs r1, r2, r1
lsls r4, r4, r1
str r4, [r5, r3]
adds r3, r5, r3
str r0, [r3, #4]
b .L2
.L3:
@ num >= 32: r7 = 1 << (num - 32); r0 = 0 serves as both the stored zero and the return value.
movs r4, #1
movs r6, r1
movs r7, r4
movs r0, #0
subs r6, r6, #32
lsls r7, r7, r6
@ pin_mask[pin] = { 0, 1 << (num - 32) }.
str r0, [r2, r3]
adds r2, r2, r3
str r7, [r2, #4]
@ pin_mult[pin] = { 0, 1 << (63 - num) }.
movs r2, #63
subs r1, r2, r1
lsls r4, r4, r1
str r0, [r5, r3]
adds r3, r5, r3
str r4, [r3, #4]
b .L2
.L5:
@ Error return: r0 = 0 - 1 = -1.
movs r0, #1
rsbs r0, r0, #0
.L2:
@ sp needed
pop {r4, r5, r6, r7, pc}
.L8:
.align 2
@ Literal pool: addresses of the two lookup tables.
.L7:
.word pin_mask
.word pin_mult
.size pin_define, .-pin_define
.align 1
.global pin_mode_in
.code 16
.thumb_func
.type pin_mode_in, %function
@ pin_mode_in: r0 = pin.
@ Stores pin_mask[pin][0] to 0x60000004 and pin_mask[pin][1] to 0x60000084
@ (PORTA_DIRCLR and PORTB_DIRCLR in the C source).
@ Leaf function: only r0-r3 are used and nothing is pushed.
pin_mode_in:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = pin_mask, r0 = pin * 8 (row offset), r1 = pin_mask[pin][0].
ldr r3, .L10
lsls r0, r0, #3
ldr r1, [r0, r3]
@ r2 = first register address; r0 = address of the pin_mask[pin] row.
ldr r2, .L10+4
adds r0, r3, r0
str r1, [r2]
@ Second column goes to the second register address.
ldr r2, [r0, #4]
ldr r3, .L10+8
@ sp needed
str r2, [r3]
bx lr
.L11:
.align 2
@ Literal pool: pin_mask, then 0x60000004 and 0x60000084 written in decimal.
.L10:
.word pin_mask
.word 1610612740
.word 1610612868
.size pin_mode_in, .-pin_mode_in
.align 1
.global pin_mode_out
.code 16
.thumb_func
.type pin_mode_out, %function
@ pin_mode_out: r0 = pin.
@ Stores pin_mask[pin][0] to 0x60000008 and pin_mask[pin][1] to 0x60000088
@ (PORTA_DIRSET and PORTB_DIRSET in the C source).
@ Leaf function: only r0-r3 are used and nothing is pushed.
pin_mode_out:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = pin_mask, r0 = pin * 8 (row offset), r1 = pin_mask[pin][0].
ldr r3, .L13
lsls r0, r0, #3
ldr r1, [r0, r3]
@ r2 = first register address; r0 = address of the pin_mask[pin] row.
ldr r2, .L13+4
adds r0, r3, r0
str r1, [r2]
@ Second column goes to the second register address.
ldr r2, [r0, #4]
ldr r3, .L13+8
@ sp needed
str r2, [r3]
bx lr
.L14:
.align 2
@ Literal pool: pin_mask, then 0x60000008 and 0x60000088 written in decimal.
.L13:
.word pin_mask
.word 1610612744
.word 1610612872
.size pin_mode_out, .-pin_mode_out
.align 1
.global pin_mode_tgl
.code 16
.thumb_func
.type pin_mode_tgl, %function
@ pin_mode_tgl: r0 = pin.
@ Stores pin_mask[pin][0] to 0x6000000C and pin_mask[pin][1] to 0x6000008C
@ (PORTA_DIRTGL and PORTB_DIRTGL in the C source).
@ Leaf function: only r0-r3 are used and nothing is pushed.
pin_mode_tgl:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = pin_mask, r0 = pin * 8 (row offset), r1 = pin_mask[pin][0].
ldr r3, .L16
lsls r0, r0, #3
ldr r1, [r0, r3]
@ r2 = first register address; r0 = address of the pin_mask[pin] row.
ldr r2, .L16+4
adds r0, r3, r0
str r1, [r2]
@ Second column goes to the second register address.
ldr r2, [r0, #4]
ldr r3, .L16+8
@ sp needed
str r2, [r3]
bx lr
.L17:
.align 2
@ Literal pool: pin_mask, then 0x6000000C and 0x6000008C written in decimal.
.L16:
.word pin_mask
.word 1610612748
.word 1610612876
.size pin_mode_tgl, .-pin_mode_tgl
.align 1
.global pin_mode
.code 16
.thumb_func
.type pin_mode, %function
@ pin_mode: r0 = pin, r1 = mode.
@ Calls pin_mode_out when bit 0 of mode is set, pin_mode_in otherwise.
@ r0 is passed through to the callee unchanged.
pin_mode:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ NOTE(review): r4 is pushed but never used here - presumably only to keep the stack 8-byte aligned; confirm.
push {r4, lr}
@ Shift bit 0 of mode into bit 31 so the N flag reflects it; bpl branches when that bit is clear.
lsls r3, r1, #31
bpl .L19
bl pin_mode_out
b .L18
.L19:
bl pin_mode_in
.L18:
@ sp needed
pop {r4, pc}
.size pin_mode, .-pin_mode
.align 1
.global pin_out_set
.code 16
.thumb_func
.type pin_out_set, %function
@ pin_out_set: r0 = pin.
@ Stores pin_mask[pin][0] to 0x60000018 and pin_mask[pin][1] to 0x60000098
@ (PORTA_OUTSET and PORTB_OUTSET in the C source).
@ Leaf function: only r0-r3 are used and nothing is pushed.
pin_out_set:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = pin_mask, r0 = pin * 8 (row offset), r1 = pin_mask[pin][0].
ldr r3, .L22
lsls r0, r0, #3
ldr r1, [r0, r3]
@ r2 = first register address; r0 = address of the pin_mask[pin] row.
ldr r2, .L22+4
adds r0, r3, r0
str r1, [r2]
@ Second column goes to the second register address.
ldr r2, [r0, #4]
ldr r3, .L22+8
@ sp needed
str r2, [r3]
bx lr
.L23:
.align 2
@ Literal pool: pin_mask, then 0x60000018 and 0x60000098 written in decimal.
.L22:
.word pin_mask
.word 1610612760
.word 1610612888
.size pin_out_set, .-pin_out_set
.align 1
.global pin_out_clr
.code 16
.thumb_func
.type pin_out_clr, %function
@ pin_out_clr: r0 = pin.
@ Stores pin_mask[pin][0] to 0x60000014 and pin_mask[pin][1] to 0x60000094
@ (PORTA_OUTCLR and PORTB_OUTCLR in the C source).
@ Leaf function: only r0-r3 are used and nothing is pushed.
pin_out_clr:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = pin_mask, r0 = pin * 8 (row offset), r1 = pin_mask[pin][0].
ldr r3, .L25
lsls r0, r0, #3
ldr r1, [r0, r3]
@ r2 = first register address; r0 = address of the pin_mask[pin] row.
ldr r2, .L25+4
adds r0, r3, r0
str r1, [r2]
@ Second column goes to the second register address.
ldr r2, [r0, #4]
ldr r3, .L25+8
@ sp needed
str r2, [r3]
bx lr
.L26:
.align 2
@ Literal pool: pin_mask, then 0x60000014 and 0x60000094 written in decimal.
.L25:
.word pin_mask
.word 1610612756
.word 1610612884
.size pin_out_clr, .-pin_out_clr
.align 1
.global pin_out_tgl
.code 16
.thumb_func
.type pin_out_tgl, %function
@ pin_out_tgl: r0 = pin.
@ Stores pin_mask[pin][0] to 0x6000001C and pin_mask[pin][1] to 0x6000009C
@ (PORTA_OUTTGL and PORTB_OUTTGL in the C source).
@ Leaf function: only r0-r3 are used and nothing is pushed.
pin_out_tgl:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = pin_mask, r0 = pin * 8 (row offset), r1 = pin_mask[pin][0].
ldr r3, .L28
lsls r0, r0, #3
ldr r1, [r0, r3]
@ r2 = first register address; r0 = address of the pin_mask[pin] row.
ldr r2, .L28+4
adds r0, r3, r0
str r1, [r2]
@ Second column goes to the second register address.
ldr r2, [r0, #4]
ldr r3, .L28+8
@ sp needed
str r2, [r3]
bx lr
.L29:
.align 2
@ Literal pool: pin_mask, then 0x6000001C and 0x6000009C written in decimal.
.L28:
.word pin_mask
.word 1610612764
.word 1610612892
.size pin_out_tgl, .-pin_out_tgl
.align 1
.global pin_out
.code 16
.thumb_func
.type pin_out, %function
@ pin_out: r0 = pin, r1 = state.
@ Calls pin_out_set when state is nonzero, pin_out_clr when it is zero.
@ r0 is passed through to the callee unchanged.
pin_out:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ NOTE(review): r4 is pushed but never used here - presumably only to keep the stack 8-byte aligned; confirm.
push {r4, lr}
cmp r1, #0
beq .L31
bl pin_out_set
b .L30
.L31:
bl pin_out_clr
.L30:
@ sp needed
pop {r4, pc}
.size pin_out, .-pin_out
.align 1
.global pin_in
.code 16
.thumb_func
.type pin_in, %function
@ pin_in: r0 = pin on entry; r0 = 0 or 1 on return.
@ Loads the words at 0x60000020 and 0x600000A0 (PORTA_IN and PORTB_IN in the
@ C source), multiplies each by its pin_mult[pin] entry, ORs the products and
@ returns bit 31 of the result.
pin_in:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ Keep pin in r2; r0 is reused below for the second input word.
movs r2, r0
ldr r3, .L34
@ r4 is saved because it holds the first input word.
push {r4, lr}
@ r4 = word at 0x60000020, r0 = word at 0x600000A0, r1 = pin_mult.
ldr r4, [r3]
ldr r3, .L34+4
ldr r1, .L34+8
ldr r0, [r3]
@ r3 = pin * 8 (row offset); r2 = pin_mult[pin][0]; r3 = pin_mult[pin][1].
lsls r3, r2, #3
ldr r2, [r3, r1]
adds r3, r1, r3
ldr r3, [r3, #4]
@ 32-bit products; a zero multiplier yields zero for that term.
muls r2, r4
muls r0, r3
orrs r0, r2
@ Logical shift right by 31 leaves only the former bit 31, giving 0 or 1.
lsrs r0, r0, #31
@ sp needed
pop {r4, pc}
.L35:
.align 2
@ Literal pool: 0x60000020 and 0x600000A0 written in decimal, then pin_mult.
.L34:
.word 1610612768
.word 1610612896
.word pin_mult
.size pin_in, .-pin_in
@ Storage for the two lookup tables as common symbols: 320 bytes each, alignment 4.
.comm pin_mult,320,4
.comm pin_mask,320,4
.ident "GCC: (GNU Tools for ARM Embedded Processors) 5.4.1 20160919 (release) [ARM/embedded-5-branch revision 240496]"
(exact output, only whitespace modified for easier reading).
Essentially, pin_mask[logical][bank] contains the bit mask – only one bit set per logical pin! – used with clear/set/toggle registers (when setting pin direction or output state); and pin_mult[logical][bank] contains a multiplier that shifts the desired bit to the most significant position, or zero if the bank does not affect the logical pin state, for use when reading the pin states. You have 40 logical pins, and the SAMC21 has GPIO pins in two logical banks, so each lookup table takes 40 × 2 × 4 = 320 bytes, and the two together take 640 bytes of SRAM.
Is this fast enough for you? I doubt you can get 3 MHz with a 48 MHz MCU – that is 1:16, or only 16 clock cycles per operation – but it is not that much work.
A similar approach – using a bit mask per pin per bank and CLR/SET/TGL registers; and a multiplier to "shift" the input bit to the highest bit position but still being able to clear the result to zero – works on many other ARMs as well. Note that instead of a multiplier, you can use a shift count, but only if the shift instruction supports clearing the entire register (shift down by 32), or if you do not use the pin corresponding to bit 0 in any bank (so that you can add an explicit additional shift right). But, when you have a single-cycle 32×32 multiplication instruction, it makes sense to use it to one's advantage.
If I were doing this, I'd prototype it using a Teensy 4.1 with a similar scheme (with four GPIO banks, not two). It has an i.MX RT1062 running at up to 600 MHz (~960 MHz if overclocked and well cooled). That would at least tell you how much computing power is actually needed, even if you did not end up using that particular processor.