Routines to convert binary to BCD in C code

Routines to convert binary to BCD in C code
Posted by Picuino on 27 Mar, 2024 12:37
Binary to decimal BCD conversion routines have many applications. The first application I give them in new systems is to send by UART binary numbers to the computer in decimal format.
The code is written in C for Arduino, but is easily translatable to another environment.

16 bits binary to 5 digits BCD:

Code: (C) [Select]
#define BINARY_DIGITS 16 #define DECIMAL_DIGITS 5 void bin16_to_str(char *str, unsigned int bin) { char maxdig, digcnt, bitcnt; static char *p, carry; // Clear string p = str; digcnt = DECIMAL_DIGITS + 1; do { *p++ = 0; } while (--digcnt); // Transform binary to BCD bitcnt = BINARY_DIGITS; maxdig = (DECIMAL_DIGITS - 1); str += (DECIMAL_DIGITS - 1); do { // Shift left binary number with carry carry = 0; if (bin & (1L<<(BINARY_DIGITS-1))) carry |= 1; bin <<= 1; // Shift left decimal number with carry p = str; digcnt = DECIMAL_DIGITS - maxdig; do { carry = (*p<<1) + carry; if (carry >= 10) { *p-- = carry - 10; carry = 1; if (digcnt == 1) { maxdig--; digcnt++; } } else { *p-- = carry; carry = 0; } } while(--digcnt); } while(--bitcnt); // Transform BCD to ASCII digcnt = DECIMAL_DIGITS; do *str-- += '0'; while (--digcnt); } void setup() { Serial.begin(9600); } void loop() { unsigned long timeit; unsigned long bin; char strnum[DECIMAL_DIGITS + 1]; strnum[DECIMAL_DIGITS] = 0; // End of char timeit = millis(); for(bin=10000; bin<11000; bin++) { bin16_to_str(strnum, bin); } timeit = millis() - timeit; Serial.print("1 conversion in "); Serial.print(timeit); Serial.println(" microseconds"); delay(5000); for(bin=0; bin<50000; bin++) { bin16_to_str(strnum, bin); Serial.println(strnum); } }
Output:
Code: [Select]
1 Conversion in 61 microseconds 00000 00001 00002 00003 00004 00005 00006 00007 00008 00009 00010 00011 00012 00013 00014 00015 00016 00017 00018 00019 00020 00021 00022 00023 00024 00025 00026 00027 00028 00029 00030 00031 00032 00033 00034 00035 00036 00037 00038 00039 00040 00041 00042 00043 00044 00045 00046 00047 00048 00049 00050 00051 00052 00053 00054 00055 00056 00057 00058 00059 00060 00061 00062 00063 00064 00065 00066 00067 00068 00069 00070 00071 00072 00073 00074 00075 00076 00077 00078 00079 00080 00081 00082 00083 00084 00085 00086 00087 00088 00089 00090 00091 00092 00093 00094 00095 00096 00097 00098 00099 00100 00101 00102 00103 00104 00105 00106 00107 00108 00109 00110 00111 00112 00113 00114 00115 00116 00117 00118 00119 00120 00121 00122 00123 00124 00125 00126 00127 00128 00129 00130 00131 00132 00133 00134 00135 00136 00137 00138 00139 00140
Size of flash memory (function bin16_to_str) = 2084 - 1916 = 168 Bytes

=======================================================

32 bits binary to 10 digits BCD:

Code: (C) [Select]
#define BINARY_DIGITS 32 #define DECIMAL_DIGITS 10 void bin32_to_str(char *str, unsigned long bin) { char maxdig, digcnt, bitcnt; static char *p, carry; // Clear string p = str; digcnt = DECIMAL_DIGITS + 1; do { *p++ = 0; } while (--digcnt); // Transform binary to BCD bitcnt = BINARY_DIGITS; maxdig = (DECIMAL_DIGITS - 1); str += (DECIMAL_DIGITS - 1); do { // Shift left binary number with carry carry = 0; if (bin & (1L<<(BINARY_DIGITS-1))) carry |= 1; bin <<= 1; // Shift left decimal number with carry p = str; digcnt = DECIMAL_DIGITS - maxdig; do { carry = (*p<<1) + carry; if (carry >= 10) { *p-- = carry - 10; carry = 1; if (digcnt == 1) { maxdig--; digcnt++; } } else { *p-- = carry; carry = 0; } } while(--digcnt); } while(--bitcnt); // Transform BCD to ASCII digcnt = DECIMAL_DIGITS; do *str-- += '0'; while (--digcnt); } void setup() { Serial.begin(9600); } void loop() { unsigned long timeit; unsigned long bin; char strnum[DECIMAL_DIGITS + 1]; strnum[DECIMAL_DIGITS] = 0; // End of char timeit = millis(); for(bin=0x80000000; bin<0x80000000+1000; bin++) { bin32_to_str(strnum, bin); } timeit = millis() - timeit; Serial.print("1 Conversion in "); Serial.print(timeit); Serial.println(" microseconds"); delay(5000); for(bin=0; bin<100000; bin++) { bin32_to_str(strnum, bin); Serial.println(strnum); } }
Output:
Code: [Select]
1 Conversion in 234 microseconds 0000000000 0000000001 0000000002 0000000003 0000000004 0000000005 0000000006 0000000007 0000000008 0000000009 0000000010 0000000011 0000000012 0000000013 0000000014 0000000015 0000000016 0000000017 0000000018 0000000019 0000000020 0000000021 0000000022 0000000023 0000000024 0000000025 0000000026 0000000027 0000000028 0000000029 0000000030 0000000031 0000000032 0000000033 0000000034 0000000035 0000000036 0000000037 0000000038 0000000039 0000000040 0000000041 0000000042 0000000043 0000000044 0000000045 0000000046 0000000047 0000000048 0000000049 0000000050 0000000051 0000000052 0000000053 0000000054 0000000055
Size of flash memory (function bin32_to_str)= 2154 - 1926 = 228 Bytes

#1 Reply
Posted by Picuino on 27 Mar, 2024 13:02
8 bits binary to 3 digits BCD:

Code: [Select]
#define BINARY_DIGITS 8 #define DECIMAL_DIGITS 3 void bin8_to_str(char *str, unsigned char bin) { char maxdig, digcnt, bitcnt; static char *p, carry; // Clear string p = str; digcnt = DECIMAL_DIGITS + 1; do { *p++ = 0; } while (--digcnt); // Transform binary to BCD bitcnt = BINARY_DIGITS; maxdig = (DECIMAL_DIGITS - 1); str += (DECIMAL_DIGITS - 1); do { // Shift left binary number with carry carry = 0; if (bin & (1L<<(BINARY_DIGITS-1))) carry |= 1; bin <<= 1; // Shift left decimal number with carry p = str; digcnt = DECIMAL_DIGITS - maxdig; do { carry = (*p<<1) + carry; if (carry >= 10) { *p-- = carry - 10; carry = 1; if (digcnt == 1) { maxdig--; digcnt++; } } else { *p-- = carry; carry = 0; } } while(--digcnt); } while(--bitcnt); // Transform BCD to ASCII digcnt = DECIMAL_DIGITS; do *str-- += '0'; while (--digcnt); } void setup() { Serial.begin(9600); } void loop() { unsigned long timeit; unsigned char bin; char strnum[DECIMAL_DIGITS + 1]; strnum[DECIMAL_DIGITS] = 0; // End of char timeit = millis(); for(bin=0; bin<255; bin++) { bin8_to_str(strnum, bin); } timeit = millis() - timeit; Serial.print("1 conversion in "); Serial.print(timeit/2.55); Serial.println(" microseconds"); delay(5000); for(bin=0; bin<255; bin++) { bin8_to_str(strnum, bin); Serial.println(strnum); } }

Code: [Select]
1 conversion in 2.75 microseconds 000 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254

#2 Reply
Posted by tridac on 27 Mar, 2024 15:05
This is one of my standrd library functions. Edit as required...

/* Convert Binary to Bcd, Msd First */
/* -------------------------------- */
void vBin2Bcd (U32 u32Value, U8 *pu8BcdArray, U8 u8Count) { /* BcdArray has msd at array[0] on return */

U8 u8Index;

if (u32Value > 0) {
for (u8Index = 1; u8Index <= u8Count; u8Index++) {
pu8BcdArray [u8Count - u8Index] = (U8) (u32Value % 10); /* Buffer remainder */
u32Value /= 10; /* Next decade */
}
}

else { /* Don't convert zero */
for (u8Index = 0; u8Index < u8Count; u8Index++) {
pu8BcdArray [u8Index] = 0; /* Clear output */
}
}
}

A printing version:

/* Convert and Display 32 Bit Value as Decimal */
/* ------------------------------------------- */
void vPutDecimal (U32 u32Value) {

U8 u8Count = BUF_SIZE - 1;

u8Buffer [u8Count--] = 0x00; /* Terminate string */

do { /* First, split number into individual digits */
u8Buffer [u8Count--] = (U8) ((u32Value % 10) + 0x30); /* Get remainder, asciify & buffer */
} while ((u32Value /= 10) > 0 && u8Count > 0); /* Next decade, trap buffer overflow */

u8Count++; /* Correct count */
vPutString (&u8Buffer [u8Count]); /* Display, left justified */
}

#3 Reply
Posted by Picuino on 27 Mar, 2024 15:30
Quote from: tridac on 27 Mar, 2024 15:05
This is one of my standrd library functions. Edit as required...

/* Convert Binary to Bcd, Msd First */
/* -------------------------------- */
void vBin2Bcd (U32 u32Value, U8 *pu8BcdArray, U8 u8Count) { /* BcdArray has msd at array[0] on return */

U8 u8Index;

if (u32Value > 0) {
for (u8Index = 1; u8Index <= u8Count; u8Index++) {
pu8BcdArray [u8Count - u8Index] = (U8) (u32Value % 10); /* Buffer remainder */
u32Value /= 10; /* Next decade */
}
}

else { /* Don't convert zero */
for (u8Index = 0; u8Index < u8Count; u8Index++) {
pu8BcdArray [u8Index] = 0; /* Clear output */
}
}
}

Good and very short and simple routine.

My rutine, on the other hand, it does not make use of integer arithmetic ( value%10 and value /= 10) it only makes use of binary number shifts.
This means that it can generate shorter and faster assembly code, especially on small micros that do not have large ALUs nor FPUs.

#4 Reply
Posted by Picuino on 27 Mar, 2024 15:41
My routine of 32 bits use at Arduino: 228 bytes of flash, 234us/conversion

Other routine of 32 bits use at Arduino: 156 bytes of flash, 380us/conversion

It seems that in the end the routine with divisions by 10 is shorter, despite the integer arithmetic required.

#5 Reply
Posted by tridac on 27 Mar, 2024 16:03
Iirc, originally used that sort of idea in 6502 asm, but yes, like the old debate around integer divide and multiply, the old shift and add / subtract method can save a lot of memory, math lib inclusion etc. Modern micros are not so encumbered though, orders or magnitude faster, perhaps integer multiply and divide, and more flash space, so tend to think clarity rather than memory economy.

#6 Reply
Posted by radiolistener on 27 Mar, 2024 17:31
Here is simple 32-bit binary to 8-digit BCD (32-bit) conversion:
Code: [Select]
uint32_t bin2bcd(uint32_t bin) { uint32_t bcd = 0; uint32_t factor = 1; while (bin) { bcd += (bin % 10) * factor; bin /= 10; factor <<= 4; } return bcd; }
just wonder how much memory it will consume on Arduino and what is execution time in comparison to previously posted code.

Does your microcontroller supports hardware multiplication and divide for integer?

#7 Reply
Posted by Picuino on 27 Mar, 2024 19:37
32 bits binary to 10 digits BCD (with division value/10):
Code: [Select]
#define DECIMAL_DIGITS 10 void bin32_to_str(char *bcdArray, unsigned long value) { unsigned char index; if (value > 0) { for (index = 1; index <= DECIMAL_DIGITS; index++) { bcdArray[DECIMAL_DIGITS - index] = ((unsigned char) (value % 10)) + '0'; value /= 10; } } else { for (index = 0; index < DECIMAL_DIGITS; index++) { bcdArray[index] = '0'; } } } void setup() { Serial.begin(9600); } void loop() { unsigned long timeit; unsigned long bin; char strnum[DECIMAL_DIGITS + 1]; strnum[DECIMAL_DIGITS] = 0; // End of char timeit = millis(); for (bin = 0x80000000; bin < 0x80000000 + 1000; bin++) { bin32_to_str(strnum, bin); } timeit = millis() - timeit; Serial.print("1 Conversion in "); Serial.print(timeit); Serial.println(" microseconds"); delay(5000); for (bin = 0; bin < 100000; bin++) { bin32_to_str(strnum, bin); Serial.println(strnum); } }
Code: [Select]
1 Conversion in 386 microseconds 0000000000 0000000001 0000000002 0000000003 0000000004 0000000005 0000000006 0000000007 0000000008 0000000009 0000000010 0000000011 0000000012 0000000013 0000000014 0000000015 0000000016 0000000017 0000000018 0000000019 0000000020 0000000021 0000000022 0000000023 0000000024 0000000025 0000000026 0000000027 0000000028 0000000029 0000000030 0000000031 0000000032 0000000033 0000000034 0000000035 0000000036 0000000037 0000000038 0000000039 0000000040 0000000041 0000000042 0000000043 0000000044 0000000045 0000000046 0000000047 0000000048 0000000049 0000000050 0000000051 0000000052 0000000053 0000000054 0000000055 0000000056 0000000057 0000000058 0000000059 0000000060 0000000061 0000000062 0000000063 0000000064 0000000065 0000000066 0000000067 0000000068 0000000069 0000000070 0000000071 0000000072 0000000073 0000000074 0000000075 0000000076 0000000077 0000000078 0000000079 0000000080 0000000081 0000000082 0000000083 0000000084 0000000085 0000000086 0000000087 0000000088 0000000089 0000000090 0000000091 0000000092 0000000093 0000000094 0000000095 0000000096 0000000097 0000000098 0000000099 0000000100 0000000101 0000000102 0000000103 0000000104 0000000105 0000000106 0000000107 0000000108 0000000109 0000000110 0000000111 0000000112 0000000113 0000000114 0000000115 0000000116 0000000117 0000000118 0000000119 0000000120 0000000121 0000000122 0000000123 0000000124 0000000125 0000000126 0000000127 0000000128 0000000129 0000000130 0000000131 0000000132 0000000133 0000000134 0000000135 0000000136 0000000137 0000000138 0000000139 0000000140 0000000141 0000000142 0000000143 0000000144 0000000145 0000000146 0000000147 0000000148 0000000149 0000000150 0000000151 0000000152 0000000153 0000000154 0000000155 0000000156 0000000157 0000000158 0000000159 0000000160 0000000161 0000000162 0000000163 0000000164 0000000165 0000000166 0000000167 0000000168 0000000169 0000000170 0000000171 0000000172 0000000173 0000000174
Total flash memory code (function bin32_to_str): 2092-1926 = 166 bytes

#8 Reply
Posted by Nominal Animal on 27 Mar, 2024 19:41
For 8-bit microcontrollers like AVRs, I'd suggest repeated substraction instead. For example,
Code: [Select]
// SPDX-License-Identifier: CC0-1.0 #include <stdint.h> static const uint32_t u32_dec[20] = { UINT32_C(1), UINT32_C(3), UINT32_C(10), UINT32_C(30), UINT32_C(100), UINT32_C(300), UINT32_C(1000), UINT32_C(3000), UINT32_C(10000), UINT32_C(30000), UINT32_C(100000), UINT32_C(300000), UINT32_C(1000000), UINT32_C(3000000), UINT32_C(10000000), UINT32_C(30000000), UINT32_C(100000000), UINT32_C(300000000), UINT32_C(1000000000), UINT32_C(3000000000), }; // Little-endian byte order #define BYTEINDEX(n, m) (n) // Big-endian byte order // #define BYTEINDEX(n, m) ((m)-(n)) // Little-endian byte order static const uint8_t u32_bcd[20][2] = { { BYTEINDEX(0, 4), 0x01 }, { BYTEINDEX(0, 4), 0x03 }, { BYTEINDEX(0, 4), 0x10 }, { BYTEINDEX(0, 4), 0x30 }, { BYTEINDEX(1, 4), 0x01 }, { BYTEINDEX(1, 4), 0x03 }, { BYTEINDEX(1, 4), 0x10 }, { BYTEINDEX(1, 4), 0x30 }, { BYTEINDEX(2, 4), 0x01 }, { BYTEINDEX(2, 4), 0x03 }, { BYTEINDEX(2, 4), 0x10 }, { BYTEINDEX(2, 4), 0x30 }, { BYTEINDEX(3, 4), 0x01 }, { BYTEINDEX(3, 4), 0x03 }, { BYTEINDEX(3, 4), 0x10 }, { BYTEINDEX(3, 4), 0x30 }, { BYTEINDEX(4, 4), 0x01 }, { BYTEINDEX(4, 4), 0x03 }, { BYTEINDEX(4, 4), 0x10 }, { BYTEINDEX(4, 4), 0x30 }, }; void binary32_to_bcd(uint8_t dst[5], uint32_t src) { dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = 0; int8_t i = 20; while (i-->0) while (src >= u32_dec[i]) { src -= u32_dec[i]; dst[u32_bcd[i][0]] += u32_bcd[i][1]; } }takes about 258 bytes of Flash on AVR (gcc-5.4.0 -O2 -mmcu=atmega32u4), with the slowest case being 2888888888, 38 iterations of the inner loop total.

If you don't mind increasing the slowest case to 3999999999, 85 iterations of the inner loop total, you can use
Code: [Select]
// SPDX-License-Identifier: CC0-1.0 #include <stdint.h> static const uint32_t u32_dec[10] = { UINT32_C(1), UINT32_C(10), UINT32_C(100), UINT32_C(1000), UINT32_C(10000), UINT32_C(100000), UINT32_C(1000000), UINT32_C(10000000), UINT32_C(100000000), UINT32_C(1000000000), }; // Little-endian byte order #define BYTEINDEX(n, m) (n) // Big-endian byte order // #define BYTEINDEX(n, m) ((m)-(n)) // Little-endian byte order static const uint8_t u32_bcd[10][2] = { { BYTEINDEX(0, 4), 0x01 }, { BYTEINDEX(0, 4), 0x10 }, { BYTEINDEX(1, 4), 0x01 }, { BYTEINDEX(1, 4), 0x10 }, { BYTEINDEX(2, 4), 0x01 }, { BYTEINDEX(2, 4), 0x10 }, { BYTEINDEX(3, 4), 0x01 }, { BYTEINDEX(3, 4), 0x10 }, { BYTEINDEX(4, 4), 0x01 }, { BYTEINDEX(4, 4), 0x10 }, }; void binary32_to_bcd(uint8_t dst[5], uint32_t src) { dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = 0; int8_t i = sizeof u32_dec / sizeof u32_dec[0]; while (i-->0) while (src >= u32_dec[i]) { src -= u32_dec[i]; dst[u32_bcd[i][0]] += u32_bcd[i][1]; } }which compiles to 198 bytes of flash (gcc-5.4.0, -O2 -mmcu=atmega32u4).

#9 Reply
Posted by cv007 on 27 Mar, 2024 20:48
Quote
The first application I give them in new systems is to send by UART binary numbers to the computer in decimal format.
The first thing I get any new mcu to do is to use the pins, then use the uart via printf or similar. Once in place you are set for any kind of formatting you want, with no need to create bits and pieces of code that attempts to do the same. The size, once in place, remains the same so the cost is one time and 'free' to use after the first use so use it everywhere you can.

https://godbolt.org/z/cMffYMzf4
A little over 2k in size, of which printf is probably 3/4 of that. If my avr couldn't handle the extra 1.5k, I would move up to the next avr flash size. Having this is place is much more valuable than the loss of 1.5k (of which you replace with your bits/pieces of code anyway, so more like 1k).

I also have a C++ header for cout style printing, and the example above would be like so (and be about 500 bytes smaller in size)-
Code: [Select]
uart, '[', dec_(10,millis()), "] i: ", i++, endl; // using , instead of << and dec_ is decimal with width of 10 and leading spacesa header with about 500 lines of code, but is basically 2 relatively simple functions that do the work (excluding floating point, which is another function)- format a number to something readable using already set formatting options and output a string, again using width/align options already in place (everything ends up in this).

Unless you like the challenge of fitting max code into a small mcu (like 2k), just get a printf in place and output whatever you desire and you can spend more time doing things like learning printf syntax.

#10 Reply
Posted by SiliconWizard on 27 Mar, 2024 23:34
Note that unless you have to deal with a lot of digits, on small targets, a very simple approach is often not just much simpler, but also actually faster.
You just need to count each digit by powers of ten. It's a very crude division algorithm, but for this purpose it works quite well. Nominal showed an improved version of that, but one that will take less flash (likely) is just to directly count in decimal.
If you just need a few digits, no need to get more complicated than this. You just need an array of the powers of 10 corresponding to the number of digits you need to deal with. It can even be unrolled if the number of digit is small. Say for 4 digits:

nDigit1000 = 0;
while (n >= 1000)
{
nDigit1000++;
n -= 1000;
}

nDigit100 = 0;
while (n >= 100)
{
nDigit100++;
n -= 100;
}

... and so on. If the number of digits is greater than this, or you want something more compact and easier to scale up, you can use arrays instead of this loop unrolling.
You can use Nominal's approach for something a bit more efficient.

But the direct decimal subtraction is often the simplest and fastest one on small targets that don't even have a hardware multiplier (or that have one that's pretty slow) with which optimizing compilers will usually implement the dvision by 10, if you go for the mod 10, divide by 10 approach. It may look more appropriate, but it rarely is on small embedded targets.

#11 Reply
Posted by Nominal Animal on 28 Mar, 2024 06:54
There are several threads on related interesting topics in the Programming child board; just browse the thread titles for a few dozen pages past.

#12 Reply
Posted by rhodges on 04 Apr, 2024 02:48
For every architecture, I write binary to BCD and ASCII in the native assember. After three decades, I don't find the idea too hard. My approach is to use or emulate the DAA. I recently posted my code on the thread called "comments".

I consider the divide a "no go" unless the CPU has a quick hardware divide. For example, the STM8 has a hardware divide that makes the conversion fast for small numbers. Otherwise, I suggest the shift and "DAA" method.

#13 Reply
Posted by T3sl4co1l on 04 Apr, 2024 05:04
Have probably posted this before somewhere in the past, but as long as it's topical...

You can divide by multiplying by a reciprocal, assuming the reciprocal is known of course. This even allows obtaining the remainder.

Something I wrote years ago:

Code: [Select]
; ; uDivTen ; ; Unsigned division by ten. 45 words, takes 60 cycles (plus rcall). ; ; Input: ; r17:r16 = operand ; Output: ; r1:r0 = quotient ; r3:r2 = remainder ; (r16 = 10, r17 = 0) ; uDivTen: push r7 ; register usage: push r6 ; r7:r6 = operand (in[0..1]) push r5 ; r5:r4:r3:r2 = 40 bit accumulator, D[1..3] (byte 0 discarded) push r4 ; r1:r0 = partial products movw r6, r16 ; r16 = temp, r17 = zero clr r17 ; (but first, save operand) ; multiply by K = 2^20 * 16 / 10 to make D[0..3] = in[0..1] * K[0..2] ; (implicitly discarding D[0]) ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16 mul r7, r16 ; in[1] * K[2] movw r4, r0 ; save high word ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 0)) >> 0 mul r7, r16 ; in[1] * K[0] movw r2, r0 ; save low word mul r6, r16 ; in[0] * K[0] add r2, r1 ; accumulate to D[1] (discard lowest byte) adc r3, r17 adc r4, r17 adc r5, r17 ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 8)) >> 8 mul r6, r16 ; in[0] * K[1] add r2, r0 ; accumulate to D[1..2] adc r3, r1 adc r4, r17 adc r5, r17 mul r7, r16 ; in[1] * K[1] add r3, r0 ; accumulate to D[2..3] adc r4, r1 adc r5, r17 ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16 mul r6, r16 ; in[0] * K[2] add r3, r0 ; accumulate to D[2..3] adc r4, r1 adc r5, r17 ; dig remainder out of the fractional part ldi r16, 0x10 ; rounding bit add r3, r16 ldi r16, 10 mul r3, r16 ; frac * 10 mov r2, r1 clr r3 movw r0, r4 ; quotient out ; r3 = 0, r2 = [0...9], r1:r0 = [0...6553] pop r4 pop r5 pop r6 pop r7 ret ; END PROC uDivTen
Using an ad-hoc / non-standard ABI, but that's easily adapted by those in the know. That "40 bit" accumulator in the middle seems intimidating, or over-the-top, but IIRC, one or two extra bits are required to complete the full result (including remainder) and making use of the register is most likely better than any bit twiddling hacks you might do otherwise. (And, good luck getting the compiler to generate that..!)

I've never cared for BCD, and just do the x / 10, x % 10 loop to generate decimals. You'd have to be running a lot of digits to need any kind of speed. You're most likely doing it in the main() loop at a low refresh rate, for which any method will do -- my most recent codebases just let the C compiler figure it out, or use a library call such as itoa(). I haven't ran out of Flash or SRAM in a long time (...or ever?), not that I do very large projects, but still, a typical 64k AVR can hold a lot of crufty code no problem.

Tim

#14 Reply
Posted by bson on 08 Apr, 2024 21:10
For ARM, shift-and-add/subtract is probably the most efficient since it can do both in a single instruction.

I have a small utility at https://github.com/bson/divfac which factors unsigned integer division by a constant into shift-and-add. For example:

Code: [Select]
$ divfac.py 10 div10 template <typename T> static inline T div10(const T& val) { return (val >> 3) - (val >> 5) + (val >> 7) - (val >> 9) + (val >> 11) - (val >> 13) + (val >> 15) - (val >> 17) + (val >> 19) - (val >> 21) + (val >> 23) - (val >> 25) + (val >> 27) - (val >> 29) + (val >> 31); }
Obviously it spits out C++ in a way to allow running it from makefiles to easily generate header files to be included for common divisors.

There's clearly a pattern here that would make it easy to manually extend to 64 bits, but personally I've never had a use for that. For smaller T's like uint16_t, many of the terms are 0 and get eliminated during optimization.

With inline ARM assembly, a 32-bit divide-by-10 could look something like:

Code: [Select]
#include <stdint.h> static inline uint32_t div10(const uint32_t value) { uint32_t result; asm("lsr %0, %1, #3\n\t" "sub %0, %1, lsr #5\n\t" "add %0, %1, lsr #7\n\t" "sub %0, %1, lsr #9\n\t" "add %0, %1, lsr #11\n\t" "sub %0, %1, lsr #13\n\t" "add %0, %1, lsr #15\n\t" "sub %0, %1, lsr #17\n\t" "add %0, %1, lsr #19\n\t" "sub %0, %1, lsr #21\n\t" "add %0, %1, lsr #23\n\t" "sub %0, %1, lsr #25\n\t" "add %0, %1, lsr #27\n\t" "sub %0, %1, lsr #29\n\t" "add %0, %1, lsr #31" : "+r" (result) : "r" (value)); return result; }
Of course, a 16-bit divide would be shorter since half the terms can be removed.

#15 Reply
Posted by Doctorandus_P on 16 Apr, 2024 13:31
I would use ldiv( ) It does a division, and returns both the answer, and the remainer in a structure. You can also use a standard function such as itoa( ) (Integer to Ascii). Also studying the code in the standard libraries (from an Open Source compiler such as GCC) can be an excellent educational tool for leaning different types of algorithms used in programming.

Below a function I wrote years ago. Most of it is house keeping and ascii formatting (sings, dots, etc) to convert an integer into a fixed point interpretated ascii string. The while loop in the middle does the actual conversion with ldiv( ). Do note it writes the result "backwards" to the destination string (array).

// String Print a number with Fixed width and precision (decimal point). // Signed version. char* SPrintFixed( char* Buf, int32_t n, int8_t length, int8_t Precision) { char Sign, *pEnd = Buf + length; ldiv_t Number; if( n < 0) { n = -n; Sign = '-'; } else Sign = ' '; Buf[ length] = 0x00; Number.quot = n; do { Number = ldiv( Number.quot, 10); Buf[--length] = Number.rem + '0'; if( --Precision == 0) Buf[--length] = '.'; }while( (length > 0) && ((Number.quot) || (Precision >= 0)) ); if( Sign == '-') { if( length > 0) { Buf[--length] = '-'; } } while( length > 0) { // Fill remaining space with spaces. Buf[--length] = ' '; } return pEnd; }

#16 Reply
Posted by cv007 on 16 Apr, 2024 18:06
Quote
I would use ldiv( )
You will be better off letting the compiler handle it. It can figure out when you are needing quotient/remainder and then call a divmod library function which is already optimized for the particular mcu. Forcing the use of ldiv when the compiler could have done something better (much better depending on mcu) means you are crippling the more capable mcu's.

So like most things, write code that is clear and easy to understand and let the compiler optimize, which it does quite well.

#17 Reply
Posted by Doctorandus_P on 23 Apr, 2024 02:46
Do you mean with this that the compiler will optimize separate remeainer and division operations but not an ldiv?
And how do you know that ldiv is not optimized?

Saying this as a blanket statement is not very trustworthy. Modern compilers do weird things to code during optimisation, and if performance really is that important you should do some benchmarking on the uC / compiler you use.

And as for readability, I'd say my posted code is very readable, but I'm biased of course.
At least ldiv is interesting enough to mention in this thread.

#18 Reply
Posted by cv007 on 24 Apr, 2024 19:18
Quote
Do you mean with this that the compiler will optimize separate remeainer and division operations but not an ldiv?
I mean compiler writers are not unaware of these things and may already generate optimal code given simple directions in the form of user code. In the case of an mcu with hardware division, letting the compiler decide means you will be letting the compiler use the hardware division instead of forcing a call to an ldiv library function, or some other user created division function, which will be much slower and therefore defeats the purpose of the extra effort to optimize on your own.

Simple example for an avr-
https://godbolt.org/z/TsvWeTof4
They both make a single call to __divmodhi4 and the compiler sees the need for both q/r in the 'normal' version even though the div/mod statements were separated by code. Writing straight forward code in this case results in (almost) exactly the same code.

A second example, but for a cortex-m
https://godbolt.org/z/GEbY8n1nn
In this case, the compiler for whatever reason needs the divisor in a global var before it will do only the single __aeabi_idivmod call, else will get an additional __aeabi_idiv call. Even if you do not know this quirk and end up with 2 division calls its not something to worry about- you will not notice any effects that your formatting code is not optimal. Once you move over to an M3 and above, the compiler will start using hardware division and you had to do nothing to get that, where the ldiv (or any custom code) user will still be using the slow library call as the compiler will be forced to use it. (You can change the command line option to cortex-m3,m4,m33,etc. and see what you get for both functions).

So as I suggested, spend your time getting the high level code to be both readable and reliable and let the compiler handle optimization. In the case of formatting, there is little need to worry about perfect optimization.

#19 Reply
Posted by Nominal Animal on 24 Apr, 2024 20:42
Quote from: Doctorandus_P on 23 Apr, 2024 02:46
At least ldiv is interesting enough to mention in this thread.
Absolutely!

Quote from: Doctorandus_P on 23 Apr, 2024 02:46
Do you mean with this that the compiler will optimize separate remeainer and division operations but not an ldiv?
Considering the abstract machine model the C standard is based on, I'd say C compilers have more opportunities to optimize the integer-divide-and-modulo expressions compared to the div()/ldiv()/lldiv()/imaxdiv() function call.

If you use e.g. Compiler Explorer to examine the code generated at different optimization levels for different architectures from
Code: [Select]
typedef unsigned int uint; typedef struct { uint quotient; unsigned char remainder; } pair; pair div10(uint n) { return (pair){ .quotient = n / 10, .remainder = n % 10 }; }you'll see it varies depending on optimization options.

For example, for 8-bit AVR (ATmega32U4) using GCC 13.2.0 and -Os, it compiles to a simple __udivmodhi4 call. With -O2, it is open-coded. For 32-bit architectures using GCC and Clang and -O2, it typically compiles to a multiplication by 0xcccccccd ("0.1"); with -Os, to a division by 10 (wherever the instruction set has a 32-bit unsigned divide instruction).

This does not mean that /% produces better code, though; only that the compiler has more opportunities for optimization.

I do not believe in micro-optimizations, or optimizing specific functions. Hot functions should get inlined, and there the opportunities for optimization tend to matter more. It is not just the operation itself, you see, but everything else surrounding and supporting the computation operation.

The best illustration of a similar issue I know of, is Radix-sorting IEEE 754 double-precision floating point numbers (double). When their byte order is the same as unsigned integers, one only needs a preprocessing pass of flipping all bits if the sign bit is set, or only the sign bit if it is not set, to make all finite values sort in the same order as the 64-bit unsigned integer representation (after the bit flipping). Depending on the L1-cache/D-cache size, you do four to eight passes, plus the final unflipping-bits pass. This scales linearly: as it is a radix sort, it indeed sorts the finite floating-point values in O(N) time complexity. However, its cache footprint is horrible, evicting basically everything else from the L1/D-cache, so using it slows all other code running after the sort, because it ends up having to refill the cache. Thus, you need to have hundreds of millions of numbers or more, before it ends up being faster/better than QuickSort and other O(N²) and O(N log N) time complexity (but typically fast) sorting algorithms. And if the other stuff is cache-sensitive, maybe not even then; while the sort may be done faster, the other code needed to do the actual real-world task may take correspondingly longer time (due to the cache misses). (Similarly, when multiplying a pair of dense matrices, you don't need very large matrices before it becomes worthwhile to copy the data so that they are both accessed in as compact order as possible, for maximum cache locality.)

#20 Reply
Posted by T3sl4co1l on 25 Apr, 2024 04:58
Quote from: Nominal Animal on 24 Apr, 2024 20:42
The best illustration of a similar issue I know of, is Radix-sorting IEEE 754 double-precision floating point numbers (double). When their byte order is the same as unsigned integers, one only needs a preprocessing pass of flipping all bits if the sign bit is set, or only the sign bit if it is not set, to make all finite values sort in the same order as the 64-bit unsigned integer representation (after the bit flipping). Depending on the L1-cache/D-cache size, you do four to eight passes, plus the final unflipping-bits pass. This scales linearly: as it is a radix sort, it indeed sorts the finite floating-point values in O(N) time complexity. However, its cache footprint is horrible, evicting basically everything else from the L1/D-cache, so using it slows all other code running after the sort, because it ends up having to refill the cache. Thus, you need to have hundreds of millions of numbers or more, before it ends up being faster/better than QuickSort and other O(N²) and O(N log N) time complexity (but typically fast) sorting algorithms. And if the other stuff is cache-sensitive, maybe not even then; while the sort may be done faster, the other code needed to do the actual real-world task may take correspondingly longer time (due to the cache misses). (Similarly, when multiplying a pair of dense matrices, you don't need very large matrices before it becomes worthwhile to copy the data so that they are both accessed in as compact order as possible, for maximum cache locality.)

Related perspective shift: CPU time is essentially free these days, and often this is the case on more historic platforms too (most anything with slow RAM, caches, etc.). Doing float comparisons might be relatively onerous in raw cycle count (depending on platform; they've certainly improved over the years), and the integer hack is so greedily tempting. But if you're spending all your time fetching and storing, that... doesn't matter at all.

Tim

#21 Reply
Posted by John Coloccia on 26 Apr, 2024 01:00
Depends on the processor. Some processors fall flat on their face with floating point calculations.

But yes, many cases of poor performance are due to IO bounds, not computation.

Is there an actual problem that's trying to be solved here?

#22 Reply
Posted by Nominal Animal on 26 Apr, 2024 02:15
Quote from: John Coloccia on 26 Apr, 2024 01:00
Is there an actual problem that's trying to be solved here?
Not really. Binary to decimal conversion is considered "slow" and requiring "surprisingly much" machine-level code, compared to arithmetic operations, so these can be useful in some cases, but that's about it.

Library implementations, especially various printf() implementations, are written for correctness and not for efficiency, so when you work with smaller microcontrollers, especially 8-bit ones (AVRs, PICs, 8051s), a custom implementation can be useful. More annoyingly, most standard library functions use arbitrary precision arithmetic and dynamic memory allocation during conversion of floating-point numbers.

Even on fully hosted C environments, it turns out that if you read numeric data from text files, the standard C library string-to-number conversions (strtod(), strtol(), scanf() and their variants) become the bottleneck, when you have enough (megabytes) of data. Then, too, "optimized" conversion functions can reduce the load times to a fraction of what they would be using standard C conversion functions.

#23 Reply
Posted by SiliconWizard on 26 Apr, 2024 03:39
Quote from: Nominal Animal on 26 Apr, 2024 02:15
Quote from: John Coloccia on 26 Apr, 2024 01:00
Is there an actual problem that's trying to be solved here?
Not really. Binary to decimal conversion is considered "slow" and requiring "surprisingly much" machine-level code, compared to arithmetic operations, so these can be useful in some cases, but that's about it.

Library implementations, especially various printf() implementations, are written for correctness and not for efficiency, so when you work with smaller microcontrollers, especially 8-bit ones (AVRs, PICs, 8051s), a custom implementation can be useful. More annoyingly, most standard library functions use arbitrary precision arithmetic and dynamic memory allocation during conversion of floating-point numbers.

Even on fully hosted C environments, it turns out that if you read numeric data from text files, the standard C library string-to-number conversions (strtod(), strtol(), scanf() and their variants) become the bottleneck, when you have enough (megabytes) of data. Then, too, "optimized" conversion functions can reduce the load times to a fraction of what they would be using standard C conversion functions.

Yep. scanf() being the absolute dog of them all.

And sometimes the "simplest" solution is actually also the most efficient. As I said before, the simple approach to count powers of ten, requiring "only" up to 9 iterations per digit with just a subtraction and an increment per iteration, will be faster on most targets that do not have a hardware multiplier (with which most optimizing compiler will implement the divide by 10), or on which the hardware multiplier is a multi-cycle operation. In particular, I've used the solution I mentioned earlier on AVR targets and it led to a lot smaller code size, and much faster too. Don't let the fact that it looks too "naive" deter you - looking at the generated assembly if in doubt will help picking the most efficient approach, in simple cases like this.

#24 Reply
Posted by Picuino on 26 Apr, 2024 11:49
Performing the binary-to-decimal conversion with custom routines has helped me to be able to dump huge numbers (48 bits long) from the counter of an 8-bit microcontroller that counted 10MHz pulses for long periods of time and dumped the count by UART every second.