Binary-to-decimal (BCD) conversion routines have many applications. The first job I give them on a new system is sending binary numbers over the UART to the computer in decimal format.
The code below is written in C for Arduino, but is easily translated to other environments.
16-bit binary to 5-digit BCD:
#define BINARY_DIGITS 16
#define DECIMAL_DIGITS 5

void bin16_to_str(char *str, unsigned int bin) {
  char maxdig, digcnt, bitcnt;
  static char *p, carry;
  // Clear string
  p = str;
  digcnt = DECIMAL_DIGITS + 1;
  do { *p++ = 0; } while (--digcnt);
  // Transform binary to BCD
  bitcnt = BINARY_DIGITS;
  maxdig = (DECIMAL_DIGITS - 1);
  str += (DECIMAL_DIGITS - 1);
  do {
    // Shift left binary number, carry out of the MSB
    carry = 0;
    if (bin & (1U << (BINARY_DIGITS - 1)))
      carry |= 1;
    bin <<= 1;
    // Shift left decimal number with carry, doubling each active digit
    p = str;
    digcnt = DECIMAL_DIGITS - maxdig;
    do {
      carry = (*p << 1) + carry;
      if (carry >= 10) {
        *p-- = carry - 10;
        carry = 1;
        if (digcnt == 1) {  // carry into a new, more significant digit
          maxdig--;
          digcnt++;
        }
      }
      else {
        *p-- = carry;
        carry = 0;
      }
    } while (--digcnt);
  } while (--bitcnt);
  // Transform BCD to ASCII
  digcnt = DECIMAL_DIGITS;
  do *str-- += '0'; while (--digcnt);
}
void setup() {
  Serial.begin(9600);
}

void loop() {
  unsigned long timeit;
  unsigned long bin;
  char strnum[DECIMAL_DIGITS + 1];
  strnum[DECIMAL_DIGITS] = 0;  // String terminator
  timeit = millis();
  for (bin = 10000; bin < 11000; bin++) {
    bin16_to_str(strnum, bin);
  }
  timeit = millis() - timeit;  // 1000 conversions: total ms elapsed equals us per conversion
  Serial.print("1 conversion in ");
  Serial.print(timeit);
  Serial.println(" microseconds");
  delay(5000);
  for (bin = 0; bin < 50000; bin++) {
    bin16_to_str(strnum, bin);
    Serial.println(strnum);
  }
}
Output:
1 conversion in 61 microseconds
00000
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
Flash memory used by bin16_to_str = 2084 - 1916 = 168 bytes
=======================================================
32-bit binary to 10-digit BCD:
#define BINARY_DIGITS 32
#define DECIMAL_DIGITS 10

void bin32_to_str(char *str, unsigned long bin) {
  char maxdig, digcnt, bitcnt;
  static char *p, carry;
  // Clear string
  p = str;
  digcnt = DECIMAL_DIGITS + 1;
  do { *p++ = 0; } while (--digcnt);
  // Transform binary to BCD
  bitcnt = BINARY_DIGITS;
  maxdig = (DECIMAL_DIGITS - 1);
  str += (DECIMAL_DIGITS - 1);
  do {
    // Shift left binary number, carry out of the MSB
    carry = 0;
    if (bin & (1UL << (BINARY_DIGITS - 1)))  // 1UL: 1L << 31 would overflow signed long
      carry |= 1;
    bin <<= 1;
    // Shift left decimal number with carry, doubling each active digit
    p = str;
    digcnt = DECIMAL_DIGITS - maxdig;
    do {
      carry = (*p << 1) + carry;
      if (carry >= 10) {
        *p-- = carry - 10;
        carry = 1;
        if (digcnt == 1) {  // carry into a new, more significant digit
          maxdig--;
          digcnt++;
        }
      }
      else {
        *p-- = carry;
        carry = 0;
      }
    } while (--digcnt);
  } while (--bitcnt);
  // Transform BCD to ASCII
  digcnt = DECIMAL_DIGITS;
  do *str-- += '0'; while (--digcnt);
}
void setup() {
  Serial.begin(9600);
}

void loop() {
  unsigned long timeit;
  unsigned long bin;
  char strnum[DECIMAL_DIGITS + 1];
  strnum[DECIMAL_DIGITS] = 0;  // String terminator
  timeit = millis();
  for (bin = 0x80000000; bin < 0x80000000 + 1000; bin++) {
    bin32_to_str(strnum, bin);
  }
  timeit = millis() - timeit;  // 1000 conversions: total ms elapsed equals us per conversion
  Serial.print("1 conversion in ");
  Serial.print(timeit);
  Serial.println(" microseconds");
  delay(5000);
  for (bin = 0; bin < 100000; bin++) {
    bin32_to_str(strnum, bin);
    Serial.println(strnum);
  }
}
Output:
1 conversion in 234 microseconds
0000000000
0000000001
0000000002
0000000003
0000000004
0000000005
0000000006
0000000007
0000000008
0000000009
0000000010
0000000011
0000000012
0000000013
0000000014
0000000015
0000000016
0000000017
0000000018
0000000019
0000000020
0000000021
0000000022
0000000023
0000000024
0000000025
0000000026
0000000027
0000000028
0000000029
0000000030
0000000031
0000000032
0000000033
0000000034
0000000035
0000000036
0000000037
0000000038
0000000039
0000000040
0000000041
0000000042
0000000043
0000000044
0000000045
0000000046
0000000047
0000000048
0000000049
0000000050
0000000051
0000000052
0000000053
0000000054
0000000055
Flash memory used by bin32_to_str = 2154 - 1926 = 228 bytes
8-bit binary to 3-digit BCD:
#define BINARY_DIGITS 8
#define DECIMAL_DIGITS 3

void bin8_to_str(char *str, unsigned char bin) {
  char maxdig, digcnt, bitcnt;
  static char *p, carry;
  // Clear string
  p = str;
  digcnt = DECIMAL_DIGITS + 1;
  do { *p++ = 0; } while (--digcnt);
  // Transform binary to BCD
  bitcnt = BINARY_DIGITS;
  maxdig = (DECIMAL_DIGITS - 1);
  str += (DECIMAL_DIGITS - 1);
  do {
    // Shift left binary number, carry out of the MSB
    carry = 0;
    if (bin & (1U << (BINARY_DIGITS - 1)))
      carry |= 1;
    bin <<= 1;
    // Shift left decimal number with carry, doubling each active digit
    p = str;
    digcnt = DECIMAL_DIGITS - maxdig;
    do {
      carry = (*p << 1) + carry;
      if (carry >= 10) {
        *p-- = carry - 10;
        carry = 1;
        if (digcnt == 1) {  // carry into a new, more significant digit
          maxdig--;
          digcnt++;
        }
      }
      else {
        *p-- = carry;
        carry = 0;
      }
    } while (--digcnt);
  } while (--bitcnt);
  // Transform BCD to ASCII
  digcnt = DECIMAL_DIGITS;
  do *str-- += '0'; while (--digcnt);
}
void setup() {
  Serial.begin(9600);
}

void loop() {
  unsigned long timeit;
  unsigned char bin;
  char strnum[DECIMAL_DIGITS + 1];
  strnum[DECIMAL_DIGITS] = 0;  // String terminator
  timeit = millis();
  for (bin = 0; bin < 255; bin++) {
    bin8_to_str(strnum, bin);
  }
  timeit = millis() - timeit;
  Serial.print("1 conversion in ");
  Serial.print(timeit * (1000.0 / 255));  // 255 conversions in timeit ms -> us per conversion
  Serial.println(" microseconds");
  delay(5000);
  for (bin = 0; bin < 255; bin++) {
    bin8_to_str(strnum, bin);
    Serial.println(strnum);
  }
}
Output:
1 conversion in 2.75 microseconds
000
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
This is one of my standard library functions. Edit as required...
/* Convert Binary to Bcd, Msd First */
/* -------------------------------- */
void vBin2Bcd(U32 u32Value, U8 *pu8BcdArray, U8 u8Count) {  /* BcdArray has msd at array[0] on return */
    U8 u8Index;

    if (u32Value > 0) {
        for (u8Index = 1; u8Index <= u8Count; u8Index++) {
            pu8BcdArray[u8Count - u8Index] = (U8)(u32Value % 10);  /* Buffer remainder */
            u32Value /= 10;                                        /* Next decade */
        }
    }
    else {  /* Don't convert zero */
        for (u8Index = 0; u8Index < u8Count; u8Index++) {
            pu8BcdArray[u8Index] = 0;  /* Clear output */
        }
    }
}
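For example (my worked check, not from the original post), assuming U32 and U8 are typedefs for uint32_t and uint8_t:

U8 digits[5];
vBin2Bcd(1234, digits, 5);   /* digits = {0, 1, 2, 3, 4}, msd first */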
A printing version:
/* Convert and Display 32 Bit Value as Decimal */
/* ------------------------------------------- */
void vPutDecimal(U32 u32Value) {
    U8 u8Count = BUF_SIZE - 1;

    u8Buffer[u8Count--] = 0x00;                             /* Terminate string */
    do {                                                    /* First, split number into individual digits */
        u8Buffer[u8Count--] = (U8)((u32Value % 10) + 0x30); /* Get remainder, asciify & buffer */
    } while ((u32Value /= 10) > 0 && u8Count > 0);          /* Next decade, trap buffer overflow */
    u8Count++;                                              /* Correct count */
    vPutString(&u8Buffer[u8Count]);                         /* Display, left justified */
}
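The function relies on a few supporting declarations that are not shown in the post; something along these lines is presumably assumed (the size and names here are my guesses):

#define BUF_SIZE 12                /* 10 digits plus terminator, with margin (my guess) */
static U8 u8Buffer[BUF_SIZE];      /* shared conversion buffer */
void vPutString(U8 *pu8String);    /* output routine, defined elsewhere */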
IIRC, I originally used that sort of idea in 6502 asm. But yes, as in the old debate around integer divide and multiply, the old shift-and-add/subtract method can save a lot of memory, math-library inclusion, etc. Modern micros are not so encumbered, though: they are orders of magnitude faster, often have integer multiply and divide, and have more flash space, so I tend to favor clarity over memory economy.
For 8-bit microcontrollers like AVRs, I'd suggest repeated subtraction instead. For example,
// SPDX-License-Identifier: CC0-1.0
#include <stdint.h>
static const uint32_t u32_dec[20] = {
    UINT32_C(1),
    UINT32_C(3),
    UINT32_C(10),
    UINT32_C(30),
    UINT32_C(100),
    UINT32_C(300),
    UINT32_C(1000),
    UINT32_C(3000),
    UINT32_C(10000),
    UINT32_C(30000),
    UINT32_C(100000),
    UINT32_C(300000),
    UINT32_C(1000000),
    UINT32_C(3000000),
    UINT32_C(10000000),
    UINT32_C(30000000),
    UINT32_C(100000000),
    UINT32_C(300000000),
    UINT32_C(1000000000),
    UINT32_C(3000000000),
};
// Little-endian byte order
#define BYTEINDEX(n, m) (n)
// Big-endian byte order
// #define BYTEINDEX(n, m) ((m)-(n))

// { byte index within dst, packed-BCD increment }
static const uint8_t u32_bcd[20][2] = {
    { BYTEINDEX(0, 4), 0x01 },
    { BYTEINDEX(0, 4), 0x03 },
    { BYTEINDEX(0, 4), 0x10 },
    { BYTEINDEX(0, 4), 0x30 },
    { BYTEINDEX(1, 4), 0x01 },
    { BYTEINDEX(1, 4), 0x03 },
    { BYTEINDEX(1, 4), 0x10 },
    { BYTEINDEX(1, 4), 0x30 },
    { BYTEINDEX(2, 4), 0x01 },
    { BYTEINDEX(2, 4), 0x03 },
    { BYTEINDEX(2, 4), 0x10 },
    { BYTEINDEX(2, 4), 0x30 },
    { BYTEINDEX(3, 4), 0x01 },
    { BYTEINDEX(3, 4), 0x03 },
    { BYTEINDEX(3, 4), 0x10 },
    { BYTEINDEX(3, 4), 0x30 },
    { BYTEINDEX(4, 4), 0x01 },
    { BYTEINDEX(4, 4), 0x03 },
    { BYTEINDEX(4, 4), 0x10 },
    { BYTEINDEX(4, 4), 0x30 },
};
void binary32_to_bcd(uint8_t dst[5], uint32_t src)
{
    dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = 0;
    int8_t i = 20;
    while (i-- > 0)
        while (src >= u32_dec[i]) {
            src -= u32_dec[i];
            dst[u32_bcd[i][0]] += u32_bcd[i][1];
        }
}
takes about 258 bytes of Flash on AVR (gcc-5.4.0 -O2 -mmcu=atmega32u4), with the slowest case being 2888888888, 38 iterations of the inner loop total.
If you don't mind increasing the slowest case to 3999999999, 85 iterations of the inner loop total, you can use
// SPDX-License-Identifier: CC0-1.0
#include <stdint.h>
static const uint32_t u32_dec[10] = {
    UINT32_C(1),
    UINT32_C(10),
    UINT32_C(100),
    UINT32_C(1000),
    UINT32_C(10000),
    UINT32_C(100000),
    UINT32_C(1000000),
    UINT32_C(10000000),
    UINT32_C(100000000),
    UINT32_C(1000000000),
};
// Little-endian byte order
#define BYTEINDEX(n, m) (n)
// Big-endian byte order
// #define BYTEINDEX(n, m) ((m)-(n))

// { byte index within dst, packed-BCD increment }
static const uint8_t u32_bcd[10][2] = {
    { BYTEINDEX(0, 4), 0x01 },
    { BYTEINDEX(0, 4), 0x10 },
    { BYTEINDEX(1, 4), 0x01 },
    { BYTEINDEX(1, 4), 0x10 },
    { BYTEINDEX(2, 4), 0x01 },
    { BYTEINDEX(2, 4), 0x10 },
    { BYTEINDEX(3, 4), 0x01 },
    { BYTEINDEX(3, 4), 0x10 },
    { BYTEINDEX(4, 4), 0x01 },
    { BYTEINDEX(4, 4), 0x10 },
};
void binary32_to_bcd(uint8_t dst[5], uint32_t src)
{
    dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = 0;
    int8_t i = sizeof u32_dec / sizeof u32_dec[0];
    while (i-- > 0)
        while (src >= u32_dec[i]) {
            src -= u32_dec[i];
            dst[u32_bcd[i][0]] += u32_bcd[i][1];
        }
}
which compiles to 198 bytes of flash (gcc-5.4.0, -O2 -mmcu=atmega32u4).
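The result is packed BCD, two digits per byte. A minimal sketch (mine, not from the original post) that expands it into a printable 10-character string, assuming the little-endian BYTEINDEX definition above (dst[0] holds the two least significant digits):

#include <stdint.h>

void bcd_to_ascii(char str[11], const uint8_t bcd[5])
{
    uint8_t i;
    for (i = 0; i < 5; i++) {
        str[2*i]     = '0' + (bcd[4 - i] >> 4);    /* high nibble = more significant digit */
        str[2*i + 1] = '0' + (bcd[4 - i] & 0x0F);  /* low nibble */
    }
    str[10] = '\0';
}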
The first job I give them on a new system is sending binary numbers over the UART to the computer in decimal format.
The first thing I get any new mcu to do is use the pins, then use the uart via printf or similar. Once that is in place you are set for any kind of formatting you want, with no need to create bits and pieces of code that attempt to do the same. The size, once in place, remains the same, so the cost is one-time and 'free' after the first use, so use it everywhere you can.
https://godbolt.org/z/cMffYMzf4
A little over 2k in size, of which printf is probably 3/4. If my avr couldn't handle the extra 1.5k, I would move up to the next avr flash size. Having this in place is much more valuable than the loss of 1.5k (which you would replace with your own bits and pieces of code anyway, so the real loss is more like 1k).
I also have a C++ header for cout-style printing, and the example above would look like this (and be about 500 bytes smaller):
uart, '[', dec_(10,millis()), "] i: ", i++, endl; // using , instead of << and dec_ is decimal with width of 10 and leading spaces
It's a header with about 500 lines of code, but it boils down to two relatively simple functions that do the work (excluding floating point, which is another function): one formats a number into something readable using the formatting options already set, and one outputs a string, again using the width/align options already in place (everything ends up in this).
Unless you like the challenge of fitting maximum code into a small mcu (like 2k), just get a printf in place and output whatever you desire; then you can spend your time on things like learning printf syntax.
Note that unless you have to deal with a lot of digits, on small targets a very simple approach is often not just much simpler, but actually faster.
You just need to count each digit by powers of ten. It's a very crude division algorithm, but for this purpose it works quite well. Nominal showed an improved version of that, but one that will (likely) take less flash is to count directly in decimal.
If you just need a few digits, there is no need to get more complicated than this. You just need an array of the powers of 10 corresponding to the number of digits you deal with. It can even be unrolled if the number of digits is small. Say, for 4 digits:
nDigit1000 = 0;
while (n >= 1000)
{
    nDigit1000++;
    n -= 1000;
}

nDigit100 = 0;
while (n >= 100)
{
    nDigit100++;
    n -= 100;
}
... and so on. If the number of digits is greater than this, or you want something more compact and easier to scale up, you can use arrays instead of this loop unrolling, as sketched below.
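A sketch of that array-based variant using the same repeated-subtraction idea (the function name and the 4-digit size are my choices):

#include <stdint.h>

static const uint16_t pow10s[4] = { 1000, 100, 10, 1 };

void u16_to_digits(uint8_t digits[4], uint16_t n)
{
    uint8_t i;
    for (i = 0; i < 4; i++)
    {
        uint8_t d = 0;
        while (n >= pow10s[i])     /* crude division by repeated subtraction */
        {
            n -= pow10s[i];
            d++;
        }
        digits[i] = d;             /* most significant digit first */
    }
}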
You can use Nominal's approach for something a bit more efficient.
But direct decimal subtraction is often the simplest and fastest approach on small targets that don't even have a hardware multiplier (or have one that's pretty slow), which is what optimizing compilers will usually use to implement the division by 10 if you go for the mod-10, divide-by-10 approach. That approach may look more appropriate, but it rarely is on small embedded targets.
Have probably posted this before somewhere in the past, but as long as it's topical...
You can divide by multiplying by a reciprocal, assuming the reciprocal is known of course. This even allows obtaining the remainder.
Something I wrote years ago:
;
; uDivTen
;
; Unsigned division by ten. 45 words, takes 60 cycles (plus rcall).
;
; Input:
; r17:r16 = operand
; Output:
; r1:r0 = quotient
; r3:r2 = remainder
; (r16 = 10, r17 = 0)
;
uDivTen:
        push  r7                ; register usage:
        push  r6                ; r7:r6 = operand (in[0..1])
        push  r5                ; r5:r4:r3:r2 = 40 bit accumulator, D[1..3] (byte 0 discarded)
        push  r4                ; r1:r0 = partial products
        movw  r6, r16           ; r16 = temp, r17 = zero
        clr   r17               ; (but first, save operand)
        ; multiply by K = 2^20 * 16 / 10 to make D[0..3] = in[0..1] * K[0..2]
        ; (implicitly discarding D[0])
        ldi   r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16
        mul   r7, r16           ; in[1] * K[2]
        movw  r4, r0            ; save high word
        ldi   r16, (((exp2(24) / 10) + 1) & (0x0000ff << 0)) >> 0
        mul   r7, r16           ; in[1] * K[0]
        movw  r2, r0            ; save low word
        mul   r6, r16           ; in[0] * K[0]
        add   r2, r1            ; accumulate to D[1] (discard lowest byte)
        adc   r3, r17
        adc   r4, r17
        adc   r5, r17
        ldi   r16, (((exp2(24) / 10) + 1) & (0x0000ff << 8)) >> 8
        mul   r6, r16           ; in[0] * K[1]
        add   r2, r0            ; accumulate to D[1..2]
        adc   r3, r1
        adc   r4, r17
        adc   r5, r17
        mul   r7, r16           ; in[1] * K[1]
        add   r3, r0            ; accumulate to D[2..3]
        adc   r4, r1
        adc   r5, r17
        ldi   r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16
        mul   r6, r16           ; in[0] * K[2]
        add   r3, r0            ; accumulate to D[2..3]
        adc   r4, r1
        adc   r5, r17
        ; dig remainder out of the fractional part
        ldi   r16, 0x10         ; rounding bit
        add   r3, r16
        ldi   r16, 10
        mul   r3, r16           ; frac * 10
        mov   r2, r1
        clr   r3
        movw  r0, r4            ; quotient out
        ; r3 = 0, r2 = [0...9], r1:r0 = [0...6553]
        pop   r4
        pop   r5
        pop   r6
        pop   r7
        ret
; END PROC uDivTen
It uses an ad-hoc / non-standard ABI, but that's easily adapted by those in the know. That "40 bit" accumulator in the middle seems intimidating, or over-the-top, but IIRC one or two extra bits are required to complete the full result (including the remainder), and making use of the registers is most likely better than any bit-twiddling hacks you might do otherwise. (And good luck getting a compiler to generate that!)
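For reference, the same reciprocal trick in portable C might look like this (my sketch, not the original routine): with K = 2^24/10 + 1 the product x*K needs more than 32 bits, which is exactly why the assembly above uses a 40-bit accumulator, and floor((x*K) >> 24) equals x/10 exactly for every 16-bit x.

#include <stdint.h>

void udiv10_u16(uint16_t x, uint16_t *quot, uint8_t *rem)
{
    const uint32_t K = (UINT32_C(1) << 24) / 10 + 1;   /* 1677722 */
    uint16_t q = (uint16_t)(((uint64_t)x * K) >> 24);  /* floor(x / 10) */
    *quot = q;
    *rem  = (uint8_t)(x - q * 10);
}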
I've never cared for BCD, and just do the x / 10, x % 10 loop to generate decimals. You'd have to be running a lot of digits to need any kind of speed. You're most likely doing it in the main() loop at a low refresh rate, for which any method will do -- my most recent codebases just let the C compiler figure it out, or use a library call such as itoa(). I haven't run out of flash or SRAM in a long time (...or ever?), not that I do very large projects, but still, a typical 64k AVR can hold a lot of crufty code no problem.
Tim
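A minimal sketch of the x / 10, x % 10 loop described above (my code; the name, zero-padding, and buffer size are arbitrary choices):

#include <stdint.h>

void u16_to_dec(char buf[6], uint16_t x)
{
    char *p = buf + 5;
    *p = '\0';
    do {
        *--p = '0' + (char)(x % 10);   /* digits come out least significant first */
        x /= 10;
    } while (x != 0);
    while (p > buf)                    /* left-pad with zeros */
        *--p = '0';
}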
For ARM, shift-and-add/subtract is probably the most efficient since it can do both in a single instruction.
I have a small utility at
https://github.com/bson/divfac which factors unsigned integer division by a constant into shift-and-add. For example:
$ divfac.py 10 div10
template <typename T>
static inline T div10(const T& val) {
    return (val >> 3) - (val >> 5) + (val >> 7) - (val >> 9) + (val >> 11) - (val >> 13) + (val >> 15) - (val >> 17) + (val >> 19) - (val >> 21) + (val >> 23) - (val >> 25) + (val >> 27) - (val >> 29) + (val >> 31);
}
Obviously it spits out C++ in a way that allows running it from makefiles, to easily generate header files to be included for common divisors.
There's clearly a pattern here that would make it easy to extend manually to 64 bits, but I've personally never had a use for that. For smaller T's like uint16_t, many of the terms are 0 and get eliminated during optimization.
With inline ARM assembly, a 32-bit divide-by-10 could look something like:
#include <stdint.h>
static inline uint32_t div10(const uint32_t value) {
    uint32_t result;
    // Each sub/add needs the accumulator as both destination and first
    // operand, with the shifted input as the flexible second operand.
    asm("lsr %0, %1, #3\n\t"
        "sub %0, %0, %1, lsr #5\n\t"
        "add %0, %0, %1, lsr #7\n\t"
        "sub %0, %0, %1, lsr #9\n\t"
        "add %0, %0, %1, lsr #11\n\t"
        "sub %0, %0, %1, lsr #13\n\t"
        "add %0, %0, %1, lsr #15\n\t"
        "sub %0, %0, %1, lsr #17\n\t"
        "add %0, %0, %1, lsr #19\n\t"
        "sub %0, %0, %1, lsr #21\n\t"
        "add %0, %0, %1, lsr #23\n\t"
        "sub %0, %0, %1, lsr #25\n\t"
        "add %0, %0, %1, lsr #27\n\t"
        "sub %0, %0, %1, lsr #29\n\t"
        "add %0, %0, %1, lsr #31"
        : "=r" (result)   // write-only: fully set by the first instruction
        : "r" (value));
    return result;
}
Of course, a 16-bit divide would be shorter since half the terms can be removed.
I would use ldiv(). It does one division and returns both the quotient and the remainder in a structure. You can also use a standard function such as itoa() (integer to ASCII). Studying the code in the standard libraries (from an open-source compiler such as GCC) can also be an excellent educational tool for learning the different types of algorithms used in programming.
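A minimal example of ldiv() (mine, for illustration):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    ldiv_t d = ldiv(12345L, 10L);                 /* one division, both results */
    printf("quot=%ld rem=%ld\n", d.quot, d.rem);  /* prints: quot=1234 rem=5 */
    return 0;
}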
Below is a function I wrote years ago. Most of it is housekeeping and ASCII formatting (signs, dots, etc.) to convert an integer into a fixed-point-interpreted ASCII string. The while loop in the middle does the actual conversion with ldiv(). Do note that it writes the result "backwards" into the destination string (array).
// String-print a number with fixed width and precision (decimal point).
// Signed version.
char* SPrintFixed(char* Buf, int32_t n, int8_t length, int8_t Precision) {
    char Sign, *pEnd = Buf + length;
    ldiv_t Number;

    if (n < 0) {
        n = -n;
        Sign = '-';
    }
    else
        Sign = ' ';
    Buf[length] = 0x00;
    Number.quot = n;
    do {
        Number = ldiv(Number.quot, 10);
        Buf[--length] = Number.rem + '0';
        if (--Precision == 0)
            Buf[--length] = '.';
    } while ((length > 0) && ((Number.quot) || (Precision >= 0)));
    if (Sign == '-') {
        if (length > 0) {
            Buf[--length] = '-';
        }
    }
    while (length > 0) {        // Fill remaining space with spaces.
        Buf[--length] = ' ';
    }
    return pEnd;
}
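For example (my worked check, not from the original post), SPrintFixed(Buf, -12345, 10, 2) fills Buf with "   -123.45" (width 10, space-padded) and returns a pointer to the terminating NUL.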
Do you mean that the compiler will optimize separate remainder and division operations, but not an ldiv?
I mean that compiler writers are not unaware of these things and may already generate optimal code given simple directions in the form of user code. In the case of an mcu with hardware division, letting the compiler decide means it will use the hardware division, instead of being forced to call an ldiv library function or some other user-created division function, which will be much slower and therefore defeats the purpose of the extra effort spent optimizing on your own.
A simple example for an avr:
https://godbolt.org/z/TsvWeTof4
They both make a single call to __divmodhi4, and the compiler sees the need for both quotient and remainder in the 'normal' version even though the div/mod statements were separated by code. Writing straightforward code in this case results in (almost) exactly the same output.
A second example, but for a cortex-m:
https://godbolt.org/z/GEbY8n1nn
In this case, the compiler for whatever reason needs the divisor in a global var before it will emit only the single __aeabi_idivmod call; otherwise you get an additional __aeabi_idiv call. Even if you do not know this quirk and end up with two division calls, it's not something to worry about: you will not notice any effect from your formatting code being suboptimal. Once you move up to an M3 and above, the compiler will start using hardware division, and you had to do nothing to get it, whereas the ldiv user (or any custom division code) will still be using the slow library call, since the compiler is forced to use it. (You can change the command-line option to cortex-m3, m4, m33, etc. and see what you get for both functions.)
So, as I suggested, spend your time getting the high-level code to be both readable and reliable, and let the compiler handle the optimization. In the case of formatting, there is little need to worry about perfect optimization.
At least ldiv is interesting enough to mention in this thread.
Absolutely!
Do you mean that the compiler will optimize separate remainder and division operations, but not an ldiv?
Considering the abstract machine model the C standard is based on, I'd say C compilers have more opportunities to optimize the integer-divide-and-modulo expressions compared to the div()/ldiv()/lldiv()/imaxdiv() function calls.
If you use e.g. Compiler Explorer to examine the code generated at different optimization levels for different architectures from
typedef unsigned int uint;

typedef struct {
    uint quotient;
    unsigned char remainder;
} pair;

pair div10(uint n) { return (pair){ .quotient = n / 10, .remainder = n % 10 }; }
you'll see that it varies depending on the optimization options. For example, for 8-bit AVR (ATmega32U4) using GCC 13.2.0 and -Os, it compiles to a simple __udivmodhi4 call. With -O2, it is open-coded. For 32-bit architectures using GCC and Clang with -O2, it typically compiles to a multiplication by 0xcccccccd ("0.1"); with -Os, to a division by 10 (wherever the instruction set has a 32-bit unsigned divide instruction).
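For reference, the open-coded multiply-by-"0.1" corresponds roughly to the following portable C (my sketch, not compiler output): 0xCCCCCCCD is 2^35/10 rounded up, and the shift by 35 yields floor(n/10) exactly for all 32-bit n.

#include <stdint.h>

static uint32_t div10_u32(uint32_t n)
{
    return (uint32_t)(((uint64_t)n * UINT64_C(0xCCCCCCCD)) >> 35);
}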
This does not mean that /% produces better code, though; only that the compiler has more opportunities for optimization.
I do not believe in micro-optimizations, or in optimizing specific functions. Hot functions should get inlined, and there the opportunities for optimization tend to matter more. It is not just the operation itself, you see, but everything else surrounding and supporting the computation.
The best illustration of a similar issue I know of is radix-sorting IEEE 754 double-precision floating-point numbers (double). When their byte order is the same as unsigned integers', one only needs a preprocessing pass (flipping all bits if the sign bit is set, or only the sign bit if it is not set) to make all finite values sort in the same order as their 64-bit unsigned integer representations. Depending on the L1/D-cache size, you do four to eight passes, plus the final bit-unflipping pass. This scales linearly: as it is a radix sort, it indeed sorts the finite floating-point values in O(N) time complexity. However, its cache footprint is horrible, evicting basically everything else from the L1/D-cache, so it slows down all other code running after the sort, because that code ends up having to refill the cache. Thus, you need hundreds of millions of numbers or more before it ends up being faster/better than QuickSort and other O(N²)- and O(N log N)-time (but typically fast) sorting algorithms. And if the other stuff is cache-sensitive, maybe not even then; while the sort may be done faster, the other code needed to do the actual real-world task may take correspondingly longer (due to the cache misses). (Similarly, when multiplying a pair of dense matrices, you don't need very large matrices before it becomes worthwhile to copy the data so that both are accessed in as compact an order as possible, for maximum cache locality.)
Is there an actual problem that's trying to be solved here?
Not really. Binary-to-decimal conversion is considered "slow" and needs "surprisingly much" machine-level code compared to basic arithmetic operations, so these routines can be useful in some cases, but that's about it.
Library implementations, especially various printf() implementations, are written for correctness and not for efficiency, so when you work with smaller microcontrollers, especially 8-bit ones (AVRs, PICs, 8051s), a custom implementation can be useful. More annoyingly, most standard library functions use arbitrary-precision arithmetic and dynamic memory allocation during conversion of floating-point numbers.
Even in fully hosted C environments, it turns out that if you read numeric data from text files, the standard C library string-to-number conversions (strtod(), strtol(), scanf() and their variants) become the bottleneck when you have enough (megabytes) of data. Then, too, "optimized" conversion functions can reduce the load times to a fraction of what they would be with the standard C conversion functions.
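As an illustration of why such conversions can be so much faster, a fixed-format unsigned parser can be just a few operations per digit (my sketch; no overflow or error handling, unlike the standard functions):

#include <stdint.h>

const char *parse_u32(const char *s, uint32_t *out)
{
    uint32_t v = 0;
    while (*s >= '0' && *s <= '9')
        v = v * 10u + (uint32_t)(*s++ - '0');
    *out = v;
    return s;   /* points just past the last digit */
}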