Just out of interest, does the output include formatting floating-point numbers? The standard libraries (newlib and others) are extremely slow in formatting and parsing them.
On a Cortex-M0 with a "slow" multiplier, even integer formatting is slow in the standard libraries. (If you have a fast 32×32 multiply that returns the high 32 bits of the 64-bit result, you can divide by ten quickly: multiply by 3435973837 = 0xCCCCCCCD, take the upper 32 bits of the 64-bit product, and shift them right by 3 bits to get the quotient.) Assuming ILP32 (aapcs32 on Cortex-M0), with 32-bit int and long and 64-bit long long, repeated subtraction of 3 times and 1 times each power of ten (that is, building each decimal digit out of steps of 3 and 1) can be much faster. Here is an example implementation:
// SPDX-License-Identifier: CC0-1.0 (Public Domain)
// Author: Nominal Animal, 2025.
#include <stddef.h> // for NULL
#include <stdint.h>
// Subtraction steps for 32-bit conversion, indexed by decimal exponent k = 0..9.
// Column 0 is 10^k (worth one unit of the digit), column 1 is 3*10^k (worth
// three units).  The largest entry, 3*10^9, still fits in 32 bits.
#define DECADE_ROW_32(pow10) { UINT32_C(pow10), UINT32_C(3) * UINT32_C(pow10) }
static const uint32_t decades_32bit[10][2] = {
    DECADE_ROW_32(1),           // 10^0
    DECADE_ROW_32(10),          // 10^1
    DECADE_ROW_32(100),         // 10^2
    DECADE_ROW_32(1000),        // 10^3
    DECADE_ROW_32(10000),       // 10^4
    DECADE_ROW_32(100000),      // 10^5
    DECADE_ROW_32(1000000),     // 10^6
    DECADE_ROW_32(10000000),    // 10^7
    DECADE_ROW_32(100000000),   // 10^8
    DECADE_ROW_32(1000000000),  // 10^9
};
#undef DECADE_ROW_32
// Subtraction steps for the upper digits of a 64-bit value.  Row i covers the
// decimal exponent 9 + i: column 0 is 10^(9+i), column 1 is 3*10^(9+i).
// The final row (10^19) has no usable "times three" entry because 3*10^19
// does not fit in 64 bits, so its second column is a zero placeholder.
#define DECADE_ROW_64(pow10) { UINT64_C(pow10), UINT64_C(3) * UINT64_C(pow10) }
static const uint64_t decades_64bit[11][2] = {
    DECADE_ROW_64(1000000000),           // 10^9
    DECADE_ROW_64(10000000000),          // 10^10
    DECADE_ROW_64(100000000000),         // 10^11
    DECADE_ROW_64(1000000000000),        // 10^12
    DECADE_ROW_64(10000000000000),       // 10^13
    DECADE_ROW_64(100000000000000),      // 10^14
    DECADE_ROW_64(1000000000000000),     // 10^15
    DECADE_ROW_64(10000000000000000),    // 10^16
    DECADE_ROW_64(100000000000000000),   // 10^17
    DECADE_ROW_64(1000000000000000000),  // 10^18
    { UINT64_C(10000000000000000000), UINT64_C(0) },  // 10^19, no 3x entry
};
#undef DECADE_ROW_64
// Internal 32-bit unsigned integer conversion routine.
//
// Stores the decimal representation of val at buf, followed by a terminating
// nul byte.  end points to the last writable character of the buffer: digits
// must end before it, and only the nul may be stored at *end.
//
// The caller must pass a non-NULL buf with buf < end.
//
// Returns a pointer to the terminating nul, or NULL (writing nothing) when
// the digits plus the nul do not fit.
static char *do_append_u32(char *buf, char *const end, uint32_t val) {
    // Find the index of the most significant digit; n + 1 digits are written.
    int_fast8_t n = 0;
    while (n < 9 && val >= decades_32bit[n + 1][0])
        n++;

    // Digits occupy buf[0..n] and the nul goes to buf[n + 1], which must not
    // lie beyond end.  Comparing the pointer difference (rather than forming
    // buf + n) also avoids creating a pointer outside the buffer.
    if (end - buf <= n)
        return NULL;

    // Build each digit by repeated subtraction: steps of three units first,
    // then single units, so no multiplication or division is needed.
    for (; n >= 0; n--) {
        char digit = '0';
        while (val >= decades_32bit[n][1]) {
            val -= decades_32bit[n][1];
            digit += 3;
        }
        while (val >= decades_32bit[n][0]) {
            val -= decades_32bit[n][0];
            digit += 1;
        }
        *(buf++) = digit;
    }

    *buf = '\0';
    return buf;
}
// Internal 64-bit unsigned integer conversion routine.
//
// Stores the decimal representation of val at buf, followed by a terminating
// nul byte.  end points to the last writable character of the buffer: digits
// must end before it, and only the nul may be stored at *end.
//
// The caller must pass a non-NULL buf with buf < end.
//
// Returns a pointer to the terminating nul, or NULL (writing nothing) when
// the digits plus the nul do not fit.
static char *do_append_u64(char *buf, char *const end, uint64_t val) {
    // If fits in 32 bits, treat as 32-bit.
    if ((uint64_t)(uint32_t)(val) == val)
        return do_append_u32(buf, end, (uint32_t)val);

    // From here on val >= 2^32, which exceeds decades_64bit[0][0] (10^9).
    // Find the index of the most significant digit in the 64-bit table.
    int_fast8_t n = 0;
    while (n < 10 && val >= decades_64bit[n + 1][0])
        n++;

    // Output is n + 1 digits from the 64-bit table plus nine digits from the
    // 32-bit table, i.e. buf[0..n + 9], with the nul at buf[n + 10], which
    // must not lie beyond end.
    if (end - buf <= n + 9)
        return NULL;

    // A 20-digit value needs special handling: the 10^19 row has no usable
    // "times three" entry, so only single-unit steps are taken for it.
    if (n == 10) {
        char digit = '0';
        while (val >= decades_64bit[10][0]) {
            val -= decades_64bit[10][0];
            digit++;
        }
        *(buf++) = digit;
        n--;
    }

    // Digits for 10^(9+n) down to 10^9, using 64-bit subtraction.
    for (; n >= 0; n--) {
        char digit = '0';
        while (val >= decades_64bit[n][1]) {
            val -= decades_64bit[n][1];
            digit += 3;
        }
        while (val >= decades_64bit[n][0]) {
            val -= decades_64bit[n][0];
            digit += 1;
        }
        *(buf++) = digit;
    }

    // The remainder is below 10^9, so the last nine digits (including any
    // leading zeros) can be produced with cheaper 32-bit arithmetic.
    uint32_t v32 = (uint32_t)val;
    for (n = 8; n >= 0; n--) {
        char digit = '0';
        while (v32 >= decades_32bit[n][1]) {
            v32 -= decades_32bit[n][1];
            digit += 3;
        }
        while (v32 >= decades_32bit[n][0]) {
            v32 -= decades_32bit[n][0];
            digit += 1;
        }
        *(buf++) = digit;
    }

    *buf = '\0';
    return buf;
}
// Append an unsigned 32-bit integer (as %u would print it) to a buffer.
//   buf  where the first digit goes
//   end  last writable character of the buffer; the terminating nul may be
//        stored at *end
// Returns a pointer to the string-terminating nul byte, or NULL when buf is
// NULL, the buffer is already full (buf >= end), or the conversion routine
// reports that the value does not fit.
char *append_u32(char *buf, char *const end, uint32_t val) {
    const int have_room = (buf != NULL) && (buf < end);
    return have_room ? do_append_u32(buf, end, val) : NULL;
}
// Convert a signed 32-bit integer (%d) to decimal string, and store to buf.
//   buf  where the first character (sign or digit) goes
//   end  last writable character of the buffer; the terminating nul may be
//        stored at *end
// Returns a pointer to the string-terminating nul byte, or NULL when buf is
// NULL or the value does not fit.  The sign is stored only after the digits
// have been converted successfully.
char *append_i32(char *buf, char *const end, int32_t val) {
    // Abort if no buffer, or if buffer full.
    if (!buf || buf >= end)
        return NULL;

    // Nonnegative, so treat as unsigned.
    if (val >= 0)
        return do_append_u32(buf, end, (uint32_t)val);

    // A negative value needs the sign, at least one digit, and the nul.
    if (end - buf < 2)
        return NULL;

    // Negate in unsigned arithmetic: -val on the signed type overflows
    // (undefined behavior) for INT32_MIN, whereas 0 - (uint32_t)val wraps
    // to the correct magnitude for every negative input.
    const uint32_t magnitude = (uint32_t)(UINT32_C(0) - (uint32_t)val);

    // Convert the digits first and add the sign only on success.
    char *const tail = do_append_u32(buf + 1, end, magnitude);
    if (tail)
        *buf = '-';
    return tail;
}
// Append an unsigned 64-bit integer (as %llu would print it) to a buffer.
//   buf  where the first digit goes
//   end  last writable character of the buffer; the terminating nul may be
//        stored at *end
// Returns a pointer to the string-terminating nul byte, or NULL when buf is
// NULL, the buffer is already full (buf >= end), or the conversion routine
// reports that the value does not fit.
char *append_u64(char *buf, char *const end, uint64_t val) {
    const int have_room = (buf != NULL) && (buf < end);
    return have_room ? do_append_u64(buf, end, val) : NULL;
}
// Convert a signed 64-bit integer (%lld) to decimal string, and store to buf.
//   buf  where the first character (sign or digit) goes
//   end  last writable character of the buffer; the terminating nul may be
//        stored at *end
// Returns a pointer to the string-terminating nul byte, or NULL when buf is
// NULL or the value does not fit.  The sign is stored only after the digits
// have been converted successfully.
char *append_i64(char *buf, char *const end, int64_t val) {
    // Abort if no buffer, or if buffer full.
    if (!buf || buf >= end)
        return NULL;

    // Nonnegative, so treat as unsigned.
    if (val >= 0)
        return do_append_u64(buf, end, (uint64_t)val);

    // A negative value needs the sign, at least one digit, and the nul.
    if (end - buf < 2)
        return NULL;

    // Negate in unsigned arithmetic: -val on the signed type overflows
    // (undefined behavior) for INT64_MIN, whereas 0 - (uint64_t)val wraps
    // to the correct magnitude for every negative input.
    const uint64_t magnitude = (uint64_t)(UINT64_C(0) - (uint64_t)val);

    // Convert the digits first and add the sign only on success.
    char *const tail = do_append_u64(buf + 1, end, magnitude);
    if (tail)
        *buf = '-';
    return tail;
}
The idea in the end = append_type(dest, last, value); interface is to efficiently append the decimal value to the buffer. When the value does not fit, it will return NULL (and append_type(NULL,...) is safe and will also return NULL). You can always call the function with dest pointing to the next free character in your output buffer, and last pointing to the last character in that buffer, and if the function returns non-NULL, it points to the next dest, otherwise it did not fit.
(That is, you can at any point safely try to append a new substring to your buffer. If it returns NULL, it didn't modify more than the start character (which should be the first free character in the buffer anyway), and that only when the value is negative; you can easily move the sign to after the digits have been filled, to ensure no modification is done. You can also remove all the 64-bit stuff, if you don't use long long, uint64_t, int64_t, uintmax_t, or intmax_t types.)
Each digit requires 2.1 iterations on average (0 1 2 1 2 3 2 3 4 3, to be exact, per possible decimal digit), with each iteration consisting of one subtraction and one addition. There are no multiplications or divisions at all (except for calculating the look-up array addresses, which are bit shifts), so this approach is suitable for slow-multiplication architectures in general, including 8-bitters (although they can benefit from adding 8- and 16-bit converters also).
This is not magic, though. On x86-64 with fast multiplication, even the standard snprintf() is about 2.4× faster in converting 64-bit unsigned numbers (because x86-64 has a fast hardware 64×64=128-bit integer multiplication, so that (x/10) is implemented as (x*0xCCCCCCCCCCCCCCCD)>>67). On the other hand, for 32-bit unsigned integers (append_u32() and append_i32() versus snprintf("%u") and snprintf("%d")), standard snprintf() is only about 1.5× faster (edited: I originally reported it as about 2.6× slower, but my laptop's frequency scaling had skewed the original results). (These figures are from a microbenchmark covering all 32-bit unsigned integers uniformly at random; if you typically print a smaller range of values, expect different results.)