Author Topic: Routines to convert binary to BCD in C code (Read 2887 times)

Picuino · « **on:** March 27, 2024, 12:37:41 pm »

Binary to decimal BCD conversion routines have many applications. The first application I give them in new systems is to send by UART binary numbers to the computer in decimal format.
The code is written in C for Arduino, but is easily translatable to another environment.

16 bits binary to 5 digits BCD:

Code: (C) [Select]

#define BINARY_DIGITS   16
#define DECIMAL_DIGITS  5


void bin16_to_str(char *str, unsigned int bin) {
   char maxdig, digcnt, bitcnt;
   static char *p, carry;
 
   // Clear string
   p = str;
   digcnt = DECIMAL_DIGITS + 1;
   do { *p++ = 0; } while (--digcnt);
 
   // Transform binary to BCD
   bitcnt = BINARY_DIGITS;
   maxdig = (DECIMAL_DIGITS - 1);
   str += (DECIMAL_DIGITS - 1);
   do {
      // Shift left binary number with carry
      carry = 0;
      if (bin & (1L<<(BINARY_DIGITS-1)))
         carry |= 1;
      bin <<= 1;

      // Shift left decimal number with carry
      p = str;
      digcnt = DECIMAL_DIGITS - maxdig;
      do {
         carry = (*p<<1) + carry;
         if (carry >= 10) {
            *p-- = carry - 10;
            carry = 1;
            if (digcnt == 1) {
               maxdig--;
               digcnt++;
            }
         }
         else {
            *p-- = carry;
            carry = 0;
         }
      } while(--digcnt);
   } while(--bitcnt);

   // Transform BCD to ASCII
   digcnt = DECIMAL_DIGITS;
   do *str-- += '0'; while (--digcnt);
}


void setup() {
   Serial.begin(9600);
}


void loop() {
   unsigned long timeit;
   unsigned long bin;
   char strnum[DECIMAL_DIGITS + 1];
   strnum[DECIMAL_DIGITS] = 0; // End of char

   timeit = millis();
   for(bin=10000; bin<11000; bin++) {
      bin16_to_str(strnum, bin);
   }
   timeit = millis() - timeit;
   Serial.print("1 conversion in ");
   Serial.print(timeit);
   Serial.println(" microseconds");
   delay(5000);
   
   for(bin=0; bin<50000; bin++) {
      bin16_to_str(strnum, bin);
      Serial.println(strnum);
   }
}

Output:

Code: [Select]

1 Conversion in 61 microseconds
00000
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140

Size of flash memory (function bin16_to_str) = 2084 - 1916 = 168 Bytes

=======================================================

32 bits binary to 10 digits BCD:

Code: (C) [Select]

#define BINARY_DIGITS   32
#define DECIMAL_DIGITS  10


void bin32_to_str(char *str, unsigned long bin) {
   char maxdig, digcnt, bitcnt;
   static char *p, carry;
 
   // Clear string
   p = str;
   digcnt = DECIMAL_DIGITS + 1;
   do { *p++ = 0; } while (--digcnt);
 
   // Transform binary to BCD
   bitcnt = BINARY_DIGITS;
   maxdig = (DECIMAL_DIGITS - 1);
   str += (DECIMAL_DIGITS - 1);
   do {
      // Shift left binary number with carry
      carry = 0;
      if (bin & (1L<<(BINARY_DIGITS-1)))
         carry |= 1;
      bin <<= 1;

      // Shift left decimal number with carry
      p = str;
      digcnt = DECIMAL_DIGITS - maxdig;
      do {
         carry = (*p<<1) + carry;
         if (carry >= 10) {
            *p-- = carry - 10;
            carry = 1;
            if (digcnt == 1) {
               maxdig--;
               digcnt++;
            }
         }
         else {
            *p-- = carry;
            carry = 0;
         }
      } while(--digcnt);
   } while(--bitcnt);

   // Transform BCD to ASCII
   digcnt = DECIMAL_DIGITS;
   do *str-- += '0'; while (--digcnt);
}


void setup() {
   Serial.begin(9600);
}


void loop() {
   unsigned long timeit;
   unsigned long bin;
   char strnum[DECIMAL_DIGITS + 1];
   strnum[DECIMAL_DIGITS] = 0; // End of char

   timeit = millis();
   for(bin=0x80000000; bin<0x80000000+1000; bin++) {
      bin32_to_str(strnum, bin);
   }
   timeit = millis() - timeit;
   Serial.print("1 Conversion in ");
   Serial.print(timeit);
   Serial.println(" microseconds");
   delay(5000);
   
   for(bin=0; bin<100000; bin++) {
      bin32_to_str(strnum, bin);
      Serial.println(strnum);
   }
}

Output:

Code: [Select]

1 Conversion in 234 microseconds
0000000000
0000000001
0000000002
0000000003
0000000004
0000000005
0000000006
0000000007
0000000008
0000000009
0000000010
0000000011
0000000012
0000000013
0000000014
0000000015
0000000016
0000000017
0000000018
0000000019
0000000020
0000000021
0000000022
0000000023
0000000024
0000000025
0000000026
0000000027
0000000028
0000000029
0000000030
0000000031
0000000032
0000000033
0000000034
0000000035
0000000036
0000000037
0000000038
0000000039
0000000040
0000000041
0000000042
0000000043
0000000044
0000000045
0000000046
0000000047
0000000048
0000000049
0000000050
0000000051
0000000052
0000000053
0000000054
0000000055

Size of flash memory (function bin32_to_str)= 2154 - 1926 = 228 Bytes

Picuino · « **Reply #1 on:** March 27, 2024, 01:02:09 pm »

8 bits binary to 3 digits BCD:

Code: [Select]

#define BINARY_DIGITS   8
#define DECIMAL_DIGITS  3


void bin8_to_str(char *str, unsigned char bin) {
   char maxdig, digcnt, bitcnt;
   static char *p, carry;
 
   // Clear string
   p = str;
   digcnt = DECIMAL_DIGITS + 1;
   do { *p++ = 0; } while (--digcnt);
 
   // Transform binary to BCD
   bitcnt = BINARY_DIGITS;
   maxdig = (DECIMAL_DIGITS - 1);
   str += (DECIMAL_DIGITS - 1);
   do {
      // Shift left binary number with carry
      carry = 0;
      if (bin & (1L<<(BINARY_DIGITS-1)))
         carry |= 1;
      bin <<= 1;

      // Shift left decimal number with carry
      p = str;
      digcnt = DECIMAL_DIGITS - maxdig;
      do {
         carry = (*p<<1) + carry;
         if (carry >= 10) {
            *p-- = carry - 10;
            carry = 1;
            if (digcnt == 1) {
               maxdig--;
               digcnt++;
            }
         }
         else {
            *p-- = carry;
            carry = 0;
         }
      } while(--digcnt);
   } while(--bitcnt);

   // Transform BCD to ASCII
   digcnt = DECIMAL_DIGITS;
   do *str-- += '0'; while (--digcnt);
}


void setup() {
   Serial.begin(9600);
}


void loop() {
   unsigned long timeit;
   unsigned char bin;
   char strnum[DECIMAL_DIGITS + 1];
   strnum[DECIMAL_DIGITS] = 0; // End of char

   timeit = millis();
   for(bin=0; bin<255; bin++) {
      bin8_to_str(strnum, bin);
   }
   timeit = millis() - timeit;
   Serial.print("1 conversion in ");
   Serial.print(timeit/2.55);
   Serial.println(" microseconds");
   delay(5000);
   
   for(bin=0; bin<255; bin++) {
      bin8_to_str(strnum, bin);
      Serial.println(strnum);
   }
}

Code: [Select]

1 conversion in 2.75 microseconds
000
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254

tridac · « **Reply #2 on:** March 27, 2024, 03:05:19 pm »

This is one of my standrd library functions. Edit as required...

/* Convert Binary to Bcd, Msd First */
/* -------------------------------- */
void vBin2Bcd (U32 u32Value, U8 *pu8BcdArray, U8 u8Count) { /* BcdArray has msd at array[0] on return */

U8 u8Index;

if (u32Value > 0) {
for (u8Index = 1; u8Index <= u8Count; u8Index++) {
pu8BcdArray [u8Count - u8Index] = (U8) (u32Value % 10); /* Buffer remainder */
u32Value /= 10; /* Next decade */
}
}

else { /* Don't convert zero */
for (u8Index = 0; u8Index < u8Count; u8Index++) {
pu8BcdArray [u8Index] = 0; /* Clear output */
}
}
}

A printing version:

/* Convert and Display 32 Bit Value as Decimal */
/* ------------------------------------------- */
void vPutDecimal (U32 u32Value) {

U8 u8Count = BUF_SIZE - 1;

u8Buffer [u8Count--] = 0x00; /* Terminate string */

do { /* First, split number into individual digits */
u8Buffer [u8Count--] = (U8) ((u32Value % 10) + 0x30); /* Get remainder, asciify & buffer */
} while ((u32Value /= 10) > 0 && u8Count > 0); /* Next decade, trap buffer overflow */

u8Count++; /* Correct count */
vPutString (&u8Buffer [u8Count]); /* Display, left justified */
}

Picuino · « **Reply #3 on:** March 27, 2024, 03:30:24 pm »

Quote from: tridac on March 27, 2024, 03:05:19 pm

This is one of my standrd library functions. Edit as required...

/* Convert Binary to Bcd, Msd First */
/* -------------------------------- */
void vBin2Bcd (U32 u32Value, U8 *pu8BcdArray, U8 u8Count) { /* BcdArray has msd at array[0] on return */

U8 u8Index;

if (u32Value > 0) {
for (u8Index = 1; u8Index <= u8Count; u8Index++) {
pu8BcdArray [u8Count - u8Index] = (U8) (u32Value % 10); /* Buffer remainder */
u32Value /= 10; /* Next decade */
}
}

else { /* Don't convert zero */
for (u8Index = 0; u8Index < u8Count; u8Index++) {
pu8BcdArray [u8Index] = 0; /* Clear output */
}
}
}

Good and very short and simple routine.

My rutine, on the other hand, it does not make use of integer arithmetic ( value%10 and value /= 10) it only makes use of binary number shifts.
This means that it can generate shorter and faster assembly code, especially on small micros that do not have large ALUs nor FPUs.

Picuino · « **Reply #4 on:** March 27, 2024, 03:41:37 pm »

My routine of 32 bits use at Arduino: 228 bytes of flash, 234us/conversion

Other routine of 32 bits use at Arduino: 156 bytes of flash, 380us/conversion

It seems that in the end the routine with divisions by 10 is shorter, despite the integer arithmetic required.

tridac · « **Reply #5 on:** March 27, 2024, 04:03:04 pm »

Iirc, originally used that sort of idea in 6502 asm, but yes, like the old debate around integer divide and multiply, the old shift and add / subtract method can save a lot of memory, math lib inclusion etc. Modern micros are not so encumbered though, orders or magnitude faster, perhaps integer multiply and divide, and more flash space, so tend to think clarity rather than memory economy.

radiolistener · « **Reply #6 on:** March 27, 2024, 05:31:33 pm »

Here is simple 32-bit binary to 8-digit BCD (32-bit) conversion:

Code: [Select]

uint32_t bin2bcd(uint32_t bin) {
    uint32_t bcd = 0;
    uint32_t factor = 1;
    while (bin) {
        bcd += (bin % 10) * factor;
        bin /= 10;
        factor <<= 4;
    }
    return bcd;
}

just wonder how much memory it will consume on Arduino and what is execution time in comparison to previously posted code.

Does your microcontroller supports hardware multiplication and divide for integer?

Picuino · « **Reply #7 on:** March 27, 2024, 07:37:52 pm »

32 bits binary to 10 digits BCD (with division value/10):

Code: [Select]

#define DECIMAL_DIGITS  10

void bin32_to_str(char *bcdArray, unsigned long value) {
  unsigned char index;

  if (value > 0) {
    for (index = 1; index <= DECIMAL_DIGITS; index++) {
      bcdArray[DECIMAL_DIGITS - index] = ((unsigned char) (value % 10)) + '0';
      value /= 10;
    }
  }
  else {
    for (index = 0; index < DECIMAL_DIGITS; index++) {
      bcdArray[index] = '0';
    }
  }
}


void setup() {
  Serial.begin(9600);
}


void loop() {
  unsigned long timeit;
  unsigned long bin;
  char strnum[DECIMAL_DIGITS + 1];
  strnum[DECIMAL_DIGITS] = 0; // End of char

  timeit = millis();
  for (bin = 0x80000000; bin < 0x80000000 + 1000; bin++) {
    bin32_to_str(strnum, bin);
  }
  timeit = millis() - timeit;
  Serial.print("1 Conversion in ");
  Serial.print(timeit);
  Serial.println(" microseconds");
  delay(5000);

  for (bin = 0; bin < 100000; bin++) {
    bin32_to_str(strnum, bin);
    Serial.println(strnum);
  }
}

Code: [Select]

1 Conversion in 386 microseconds
0000000000
0000000001
0000000002
0000000003
0000000004
0000000005
0000000006
0000000007
0000000008
0000000009
0000000010
0000000011
0000000012
0000000013
0000000014
0000000015
0000000016
0000000017
0000000018
0000000019
0000000020
0000000021
0000000022
0000000023
0000000024
0000000025
0000000026
0000000027
0000000028
0000000029
0000000030
0000000031
0000000032
0000000033
0000000034
0000000035
0000000036
0000000037
0000000038
0000000039
0000000040
0000000041
0000000042
0000000043
0000000044
0000000045
0000000046
0000000047
0000000048
0000000049
0000000050
0000000051
0000000052
0000000053
0000000054
0000000055
0000000056
0000000057
0000000058
0000000059
0000000060
0000000061
0000000062
0000000063
0000000064
0000000065
0000000066
0000000067
0000000068
0000000069
0000000070
0000000071
0000000072
0000000073
0000000074
0000000075
0000000076
0000000077
0000000078
0000000079
0000000080
0000000081
0000000082
0000000083
0000000084
0000000085
0000000086
0000000087
0000000088
0000000089
0000000090
0000000091
0000000092
0000000093
0000000094
0000000095
0000000096
0000000097
0000000098
0000000099
0000000100
0000000101
0000000102
0000000103
0000000104
0000000105
0000000106
0000000107
0000000108
0000000109
0000000110
0000000111
0000000112
0000000113
0000000114
0000000115
0000000116
0000000117
0000000118
0000000119
0000000120
0000000121
0000000122
0000000123
0000000124
0000000125
0000000126
0000000127
0000000128
0000000129
0000000130
0000000131
0000000132
0000000133
0000000134
0000000135
0000000136
0000000137
0000000138
0000000139
0000000140
0000000141
0000000142
0000000143
0000000144
0000000145
0000000146
0000000147
0000000148
0000000149
0000000150
0000000151
0000000152
0000000153
0000000154
0000000155
0000000156
0000000157
0000000158
0000000159
0000000160
0000000161
0000000162
0000000163
0000000164
0000000165
0000000166
0000000167
0000000168
0000000169
0000000170
0000000171
0000000172
0000000173
0000000174

Total flash memory code (function bin32_to_str): 2092-1926 = 166 bytes

Nominal Animal · « **Reply #8 on:** March 27, 2024, 07:41:37 pm »

For 8-bit microcontrollers like AVRs, I'd suggest repeated substraction instead. For example,

Code: [Select]

// SPDX-License-Identifier: CC0-1.0
#include <stdint.h>

static const uint32_t  u32_dec[20] = {
    UINT32_C(1),
    UINT32_C(3),
    UINT32_C(10),
    UINT32_C(30),
    UINT32_C(100),
    UINT32_C(300),
    UINT32_C(1000),
    UINT32_C(3000),
    UINT32_C(10000),
    UINT32_C(30000),
    UINT32_C(100000),
    UINT32_C(300000),
    UINT32_C(1000000),
    UINT32_C(3000000),
    UINT32_C(10000000),
    UINT32_C(30000000),
    UINT32_C(100000000),
    UINT32_C(300000000),
    UINT32_C(1000000000),
    UINT32_C(3000000000),
};

// Little-endian byte order
#define  BYTEINDEX(n, m)  (n)

// Big-endian byte order
// #define  BYTEINDEX(n, m)  ((m)-(n))

// Little-endian byte order
static const uint8_t  u32_bcd[20][2] = {
    { BYTEINDEX(0, 4), 0x01 },
    { BYTEINDEX(0, 4), 0x03 },
    { BYTEINDEX(0, 4), 0x10 },
    { BYTEINDEX(0, 4), 0x30 },

    { BYTEINDEX(1, 4), 0x01 },
    { BYTEINDEX(1, 4), 0x03 },
    { BYTEINDEX(1, 4), 0x10 },
    { BYTEINDEX(1, 4), 0x30 },

    { BYTEINDEX(2, 4), 0x01 },
    { BYTEINDEX(2, 4), 0x03 },
    { BYTEINDEX(2, 4), 0x10 },
    { BYTEINDEX(2, 4), 0x30 },

    { BYTEINDEX(3, 4), 0x01 },
    { BYTEINDEX(3, 4), 0x03 },
    { BYTEINDEX(3, 4), 0x10 },
    { BYTEINDEX(3, 4), 0x30 },

    { BYTEINDEX(4, 4), 0x01 },
    { BYTEINDEX(4, 4), 0x03 },
    { BYTEINDEX(4, 4), 0x10 },
    { BYTEINDEX(4, 4), 0x30 },
};

void binary32_to_bcd(uint8_t dst[5], uint32_t src)
{
    dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = 0;

    int8_t i = 20;
    while (i-->0)
        while (src >= u32_dec[i]) {
            src -= u32_dec[i];
            dst[u32_bcd[i][0]] += u32_bcd[i][1];
        }
}

takes about 258 bytes of Flash on AVR (gcc-5.4.0 -O2 -mmcu=atmega32u4), with the slowest case being 2888888888, 38 iterations of the inner loop total.

If you don't mind increasing the slowest case to 3999999999, 85 iterations of the inner loop total, you can use

Code: [Select]

// SPDX-License-Identifier: CC0-1.0
#include <stdint.h>

static const uint32_t  u32_dec[10] = {
    UINT32_C(1),
    UINT32_C(10),
    UINT32_C(100),
    UINT32_C(1000),
    UINT32_C(10000),
    UINT32_C(100000),
    UINT32_C(1000000),
    UINT32_C(10000000),
    UINT32_C(100000000),
    UINT32_C(1000000000),
};

// Little-endian byte order
#define  BYTEINDEX(n, m)  (n)

// Big-endian byte order
// #define  BYTEINDEX(n, m)  ((m)-(n))

// Little-endian byte order
static const uint8_t  u32_bcd[10][2] = {
    { BYTEINDEX(0, 4), 0x01 },
    { BYTEINDEX(0, 4), 0x10 },

    { BYTEINDEX(1, 4), 0x01 },
    { BYTEINDEX(1, 4), 0x10 },

    { BYTEINDEX(2, 4), 0x01 },
    { BYTEINDEX(2, 4), 0x10 },

    { BYTEINDEX(3, 4), 0x01 },
    { BYTEINDEX(3, 4), 0x10 },

    { BYTEINDEX(4, 4), 0x01 },
    { BYTEINDEX(4, 4), 0x10 },
};

void binary32_to_bcd(uint8_t dst[5], uint32_t src)
{
    dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = 0;

    int8_t i = sizeof u32_dec / sizeof u32_dec[0];
    while (i-->0)
        while (src >= u32_dec[i]) {
            src -= u32_dec[i];
            dst[u32_bcd[i][0]] += u32_bcd[i][1];
        }
}

which compiles to 198 bytes of flash (gcc-5.4.0, -O2 -mmcu=atmega32u4).

cv007 · « **Reply #9 on:** March 27, 2024, 08:48:35 pm »

Quote

The first application I give them in new systems is to send by UART binary numbers to the computer in decimal format.

The first thing I get any new mcu to do is to use the pins, then use the uart via printf or similar. Once in place you are set for any kind of formatting you want, with no need to create bits and pieces of code that attempts to do the same. The size, once in place, remains the same so the cost is one time and 'free' to use after the first use so use it everywhere you can.

https://godbolt.org/z/cMffYMzf4
A little over 2k in size, of which printf is probably 3/4 of that. If my avr couldn't handle the extra 1.5k, I would move up to the next avr flash size. Having this is place is much more valuable than the loss of 1.5k (of which you replace with your bits/pieces of code anyway, so more like 1k).

I also have a C++ header for cout style printing, and the example above would be like so (and be about 500 bytes smaller in size)-

Code: [Select]

uart, '[', dec_(10,millis()), "] i: ", i++, endl; // using , instead of <<  and  dec_ is decimal with width of 10 and leading spaces

a header with about 500 lines of code, but is basically 2 relatively simple functions that do the work (excluding floating point, which is another function)- format a number to something readable using already set formatting options and output a string, again using width/align options already in place (everything ends up in this).

Unless you like the challenge of fitting max code into a small mcu (like 2k), just get a printf in place and output whatever you desire and you can spend more time doing things like learning printf syntax.

SiliconWizard · « **Reply #10 on:** March 27, 2024, 11:34:07 pm »

Note that unless you have to deal with a lot of digits, on small targets, a very simple approach is often not just much simpler, but also actually faster.
You just need to count each digit by powers of ten. It's a very crude division algorithm, but for this purpose it works quite well. Nominal showed an improved version of that, but one that will take less flash (likely) is just to directly count in decimal.
If you just need a few digits, no need to get more complicated than this. You just need an array of the powers of 10 corresponding to the number of digits you need to deal with. It can even be unrolled if the number of digit is small. Say for 4 digits:

nDigit1000 = 0;
while (n >= 1000)
{
nDigit1000++;
n -= 1000;
}

nDigit100 = 0;
while (n >= 100)
{
nDigit100++;
n -= 100;
}

... and so on. If the number of digits is greater than this, or you want something more compact and easier to scale up, you can use arrays instead of this loop unrolling.
You can use Nominal's approach for something a bit more efficient.

But the direct decimal subtraction is often the simplest and fastest one on small targets that don't even have a hardware multiplier (or that have one that's pretty slow) with which optimizing compilers will usually implement the dvision by 10, if you go for the mod 10, divide by 10 approach. It may look more appropriate, but it rarely is on small embedded targets.

Nominal Animal · « **Reply #11 on:** March 28, 2024, 06:54:11 am »

There are several threads on related interesting topics in the Programming child board; just browse the thread titles for a few dozen pages past.

rhodges · « **Reply #12 on:** April 04, 2024, 02:48:41 am »

For every architecture, I write binary to BCD and ASCII in the native assember. After three decades, I don't find the idea too hard. My approach is to use or emulate the DAA. I recently posted my code on the thread called "comments".

I consider the divide a "no go" unless the CPU has a quick hardware divide. For example, the STM8 has a hardware divide that makes the conversion fast for small numbers. Otherwise, I suggest the shift and "DAA" method.

T3sl4co1l · « **Reply #13 on:** April 04, 2024, 05:04:36 am »

Have probably posted this before somewhere in the past, but as long as it's topical...

You can divide by multiplying by a reciprocal, assuming the reciprocal is known of course. This even allows obtaining the remainder.

Something I wrote years ago:

Code: [Select]

;
;     uDivTen
;
; Unsigned division by ten.  45 words, takes 60 cycles (plus rcall).
;
; Input:
;   r17:r16 = operand
; Output:
;   r1:r0 = quotient
;   r3:r2 = remainder
;   (r16 = 10, r17 = 0)
;
uDivTen:
		push r7				; register usage:
		push r6				; r7:r6 = operand (in[0..1])
		push r5				; r5:r4:r3:r2 = 40 bit accumulator, D[1..3] (byte 0 discarded)
		push r4				; r1:r0 = partial products
		movw r6, r16			; r16 = temp, r17 = zero
		clr r17				; (but first, save operand)

		; multiply by K = 2^20 * 16 / 10 to make D[0..3] = in[0..1] * K[0..2]
		; (implicitly discarding D[0])

		ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16
		mul r7, r16			; in[1] * K[2]
		movw r4, r0			; save high word

		ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 0)) >> 0
		mul r7, r16			; in[1] * K[0]
		movw r2, r0			; save low word

		mul r6, r16			; in[0] * K[0]
		add r2, r1			; accumulate to D[1] (discard lowest byte)
		adc r3, r17
		adc r4, r17
		adc r5, r17

		ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 8)) >> 8
		mul r6, r16			; in[0] * K[1]
		add r2, r0			; accumulate to D[1..2]
		adc r3, r1
		adc r4, r17
		adc r5, r17

		mul r7, r16			; in[1] * K[1]
		add r3, r0			; accumulate to D[2..3]
		adc r4, r1
		adc r5, r17


		ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16
		mul r6, r16			; in[0] * K[2]
		add r3, r0			; accumulate to D[2..3]
		adc r4, r1
		adc r5, r17

; dig remainder out of the fractional part

		ldi r16, 0x10			; rounding bit
		add r3, r16
		ldi r16, 10
		mul r3, r16			; frac * 10
		mov r2, r1
		clr r3
		movw r0, r4			; quotient out

; r3 = 0, r2 = [0...9], r1:r0 = [0...6553]
		pop r4
		pop r5
		pop r6
		pop r7
		ret
; END PROC uDivTen

Using an ad-hoc / non-standard ABI, but that's easily adapted by those in the know. That "40 bit" accumulator in the middle seems intimidating, or over-the-top, but IIRC, one or two extra bits are required to complete the full result (including remainder) and making use of the register is most likely better than any bit twiddling hacks you might do otherwise. (And, good luck getting the compiler to generate that..!)

I've never cared for BCD, and just do the x / 10, x % 10 loop to generate decimals. You'd have to be running a lot of digits to need any kind of speed. You're most likely doing it in the main() loop at a low refresh rate, for which any method will do -- my most recent codebases just let the C compiler figure it out, or use a library call such as itoa(). I haven't ran out of Flash or SRAM in a long time (...or ever?), not that I do very large projects, but still, a typical 64k AVR can hold a lot of crufty code no problem.

Tim

bson · « **Reply #14 on:** April 08, 2024, 09:10:06 pm »

For ARM, shift-and-add/subtract is probably the most efficient since it can do both in a single instruction.

I have a small utility at https://github.com/bson/divfac which factors unsigned integer division by a constant into shift-and-add. For example:

Code: [Select]

$ divfac.py 10 div10
template <typename T>
static inline T div10(const T& val) {
  return (val >> 3) - (val >> 5) + (val >> 7) - (val >> 9) + (val >> 11) - (val >> 13) + (val >> 15) - (val >> 17) + (val >> 19) - (val >> 21) + (val >> 23) - (val >> 25) + (val >> 27) - (val >> 29) + (val >> 31);
}

Obviously it spits out C++ in a way to allow running it from makefiles to easily generate header files to be included for common divisors.

There's clearly a pattern here that would make it easy to manually extend to 64 bits, but personally I've never had a use for that. For smaller T's like uint16_t, many of the terms are 0 and get eliminated during optimization.

With inline ARM assembly, a 32-bit divide-by-10 could look something like:

Code: [Select]

#include <stdint.h>

static inline uint32_t div10(const uint32_t value) {
    uint32_t result;
    asm("lsr   %0, %1, #3\n\t"
        "sub   %0, %1, lsr #5\n\t"
        "add   %0, %1, lsr #7\n\t"
        "sub   %0, %1, lsr #9\n\t"
        "add   %0, %1, lsr #11\n\t"
        "sub   %0, %1, lsr #13\n\t"
        "add   %0, %1, lsr #15\n\t"
        "sub   %0, %1, lsr #17\n\t"
        "add   %0, %1, lsr #19\n\t"
        "sub   %0, %1, lsr #21\n\t"
        "add   %0, %1, lsr #23\n\t"
        "sub   %0, %1, lsr #25\n\t"
        "add   %0, %1, lsr #27\n\t"
        "sub   %0, %1, lsr #29\n\t"
        "add   %0, %1, lsr #31"
        : "+r" (result)
        : "r" (value));

    return result;
}

Of course, a 16-bit divide would be shorter since half the terms can be removed.

Doctorandus_P · « **Reply #15 on:** April 16, 2024, 01:31:08 pm »

I would use ldiv( ) It does a division, and returns both the answer, and the remainer in a structure. You can also use a standard function such as itoa( ) (Integer to Ascii). Also studying the code in the standard libraries (from an Open Source compiler such as GCC) can be an excellent educational tool for leaning different types of algorithms used in programming.

Below a function I wrote years ago. Most of it is house keeping and ascii formatting (sings, dots, etc) to convert an integer into a fixed point interpretated ascii string. The while loop in the middle does the actual conversion with ldiv( ). Do note it writes the result "backwards" to the destination string (array).

// String Print a number with Fixed width and precision (decimal point). // Signed version. char* SPrintFixed( char* Buf, int32_t n, int8_t length, int8_t Precision) { char Sign, *pEnd = Buf + length; ldiv_t Number; if( n < 0) { n = -n; Sign = '-'; } else Sign = ' '; Buf[ length] = 0x00; Number.quot = n; do { Number = ldiv( Number.quot, 10); Buf[--length] = Number.rem + '0'; if( --Precision == 0) Buf[--length] = '.'; }while( (length > 0) && ((Number.quot) || (Precision >= 0)) ); if( Sign == '-') { if( length > 0) { Buf[--length] = '-'; } } while( length > 0) { // Fill remaining space with spaces. Buf[--length] = ' '; } return pEnd; }

cv007 · « **Reply #16 on:** April 16, 2024, 06:06:59 pm »

Quote

I would use ldiv( )

You will be better off letting the compiler handle it. It can figure out when you are needing quotient/remainder and then call a divmod library function which is already optimized for the particular mcu. Forcing the use of ldiv when the compiler could have done something better (much better depending on mcu) means you are crippling the more capable mcu's.

So like most things, write code that is clear and easy to understand and let the compiler optimize, which it does quite well.

Doctorandus_P · « **Reply #17 on:** April 23, 2024, 02:46:29 am »

Do you mean with this that the compiler will optimize separate remeainer and division operations but not an ldiv?
And how do you know that ldiv is not optimized?

Saying this as a blanket statement is not very trustworthy. Modern compilers do weird things to code during optimisation, and if performance really is that important you should do some benchmarking on the uC / compiler you use.

And as for readability, I'd say my posted code is very readable, but I'm biased of course.
At least ldiv is interesting enough to mention in this thread.

cv007 · « **Reply #18 on:** April 24, 2024, 07:18:22 pm »

Quote

Do you mean with this that the compiler will optimize separate remeainer and division operations but not an ldiv?

I mean compiler writers are not unaware of these things and may already generate optimal code given simple directions in the form of user code. In the case of an mcu with hardware division, letting the compiler decide means you will be letting the compiler use the hardware division instead of forcing a call to an ldiv library function, or some other user created division function, which will be much slower and therefore defeats the purpose of the extra effort to optimize on your own.

Simple example for an avr-
https://godbolt.org/z/TsvWeTof4
They both make a single call to __divmodhi4 and the compiler sees the need for both q/r in the 'normal' version even though the div/mod statements were separated by code. Writing straight forward code in this case results in (almost) exactly the same code.

A second example, but for a cortex-m
https://godbolt.org/z/GEbY8n1nn
In this case, the compiler for whatever reason needs the divisor in a global var before it will do only the single __aeabi_idivmod call, else will get an additional __aeabi_idiv call. Even if you do not know this quirk and end up with 2 division calls its not something to worry about- you will not notice any effects that your formatting code is not optimal. Once you move over to an M3 and above, the compiler will start using hardware division and you had to do nothing to get that, where the ldiv (or any custom code) user will still be using the slow library call as the compiler will be forced to use it. (You can change the command line option to cortex-m3,m4,m33,etc. and see what you get for both functions).

So as I suggested, spend your time getting the high level code to be both readable and reliable and let the compiler handle optimization. In the case of formatting, there is little need to worry about perfect optimization.

Nominal Animal · « **Reply #19 on:** April 24, 2024, 08:42:59 pm »

Quote from: Doctorandus_P on April 23, 2024, 02:46:29 am

At least ldiv is interesting enough to mention in this thread.

Absolutely!

Quote from: Doctorandus_P on April 23, 2024, 02:46:29 am

Do you mean with this that the compiler will optimize separate remeainer and division operations but not an ldiv?

Considering the abstract machine model the C standard is based on, I'd say C compilers have more opportunities to optimize the integer-divide-and-modulo expressions compared to the div()/ldiv()/lldiv()/imaxdiv() function call.

If you use e.g. Compiler Explorer to examine the code generated at different optimization levels for different architectures from

Code: [Select]

typedef  unsigned int  uint;

typedef struct {
    uint           quotient;
    unsigned char  remainder;
} pair;

pair div10(uint n) { return (pair){ .quotient = n / 10, .remainder = n % 10 }; }

you'll see it varies depending on optimization options.

For example, for 8-bit AVR (ATmega32U4) using GCC 13.2.0 and -Os, it compiles to a simple __udivmodhi4 call. With -O2, it is open-coded. For 32-bit architectures using GCC and Clang and -O2, it typically compiles to a multiplication by 0xcccccccd ("0.1"); with -Os, to a division by 10 (wherever the instruction set has a 32-bit unsigned divide instruction).

This does not mean that /% produces better code, though; only that the compiler has more opportunities for optimization.

I do not believe in micro-optimizations, or optimizing specific functions. Hot functions should get inlined, and there the opportunities for optimization tend to matter more. It is not just the operation itself, you see, but everything else surrounding and supporting the computation operation.

The best illustration of a similar issue I know of, is Radix-sorting IEEE 754 double-precision floating point numbers (double). When their byte order is the same as unsigned integers, one only needs a preprocessing pass of flipping all bits if the sign bit is set, or only the sign bit if it is not set, to make all finite values sort in the same order as the 64-bit unsigned integer representation (after the bit flipping). Depending on the L1-cache/D-cache size, you do four to eight passes, plus the final unflipping-bits pass. This scales linearly: as it is a radix sort, it indeed sorts the finite floating-point values in O(N) time complexity. However, its cache footprint is horrible, evicting basically everything else from the L1/D-cache, so using it slows all other code running after the sort, because it ends up having to refill the cache. Thus, you need to have hundreds of millions of numbers or more, before it ends up being faster/better than QuickSort and other O(N²) and O(N log N) time complexity (but typically fast) sorting algorithms. And if the other stuff is cache-sensitive, maybe not even then; while the sort may be done faster, the other code needed to do the actual real-world task may take correspondingly longer time (due to the cache misses). (Similarly, when multiplying a pair of dense matrices, you don't need very large matrices before it becomes worthwhile to copy the data so that they are both accessed in as compact order as possible, for maximum cache locality.)

T3sl4co1l · « **Reply #20 on:** April 25, 2024, 04:58:19 am »

Quote from: Nominal Animal on April 24, 2024, 08:42:59 pm

The best illustration of a similar issue I know of, is Radix-sorting IEEE 754 double-precision floating point numbers (double). When their byte order is the same as unsigned integers, one only needs a preprocessing pass of flipping all bits if the sign bit is set, or only the sign bit if it is not set, to make all finite values sort in the same order as the 64-bit unsigned integer representation (after the bit flipping). Depending on the L1-cache/D-cache size, you do four to eight passes, plus the final unflipping-bits pass. This scales linearly: as it is a radix sort, it indeed sorts the finite floating-point values in O(N) time complexity. However, its cache footprint is horrible, evicting basically everything else from the L1/D-cache, so using it slows all other code running after the sort, because it ends up having to refill the cache. Thus, you need to have hundreds of millions of numbers or more, before it ends up being faster/better than QuickSort and other O(N²) and O(N log N) time complexity (but typically fast) sorting algorithms. And if the other stuff is cache-sensitive, maybe not even then; while the sort may be done faster, the other code needed to do the actual real-world task may take correspondingly longer time (due to the cache misses). (Similarly, when multiplying a pair of dense matrices, you don't need very large matrices before it becomes worthwhile to copy the data so that they are both accessed in as compact order as possible, for maximum cache locality.)

Related perspective shift: CPU time is essentially free these days, and often this is the case on more historic platforms too (most anything with slow RAM, caches, etc.). Doing float comparisons might be relatively onerous in raw cycle count (depending on platform; they've certainly improved over the years), and the integer hack is so greedily tempting. But if you're spending all your time fetching and storing, that... doesn't matter at all.

Tim

John Coloccia · « **Reply #21 on:** **Yesterday** at 01:00:38 am »

Depends on the processor. Some processors fall flat on their face with floating point calculations.

But yes, many cases of poor performance are due to IO bounds, not computation.

Is there an actual problem that's trying to be solved here?

Nominal Animal · « **Reply #22 on:** **Yesterday** at 02:15:52 am »

Quote from: John Coloccia on Yesterday at 01:00:38 am

Is there an actual problem that's trying to be solved here?

Not really. Binary to decimal conversion is considered "slow" and requiring "surprisingly much" machine-level code, compared to arithmetic operations, so these can be useful in some cases, but that's about it.

Library implementations, especially various printf() implementations, are written for correctness and not for efficiency, so when you work with smaller microcontrollers, especially 8-bit ones (AVRs, PICs, 8051s), a custom implementation can be useful. More annoyingly, most standard library functions use arbitrary precision arithmetic and dynamic memory allocation during conversion of floating-point numbers.

Even on fully hosted C environments, it turns out that if you read numeric data from text files, the standard C library string-to-number conversions (strtod(), strtol(), scanf() and their variants) become the bottleneck, when you have enough (megabytes) of data. Then, too, "optimized" conversion functions can reduce the load times to a fraction of what they would be using standard C conversion functions.

SiliconWizard · « **Reply #23 on:** **Yesterday** at 03:39:04 am »

Quote from: Nominal Animal on Yesterday at 02:15:52 am

Quote from: John Coloccia on Yesterday at 01:00:38 am
Is there an actual problem that's trying to be solved here?
Not really. Binary to decimal conversion is considered "slow" and requiring "surprisingly much" machine-level code, compared to arithmetic operations, so these can be useful in some cases, but that's about it.

Library implementations, especially various printf() implementations, are written for correctness and not for efficiency, so when you work with smaller microcontrollers, especially 8-bit ones (AVRs, PICs, 8051s), a custom implementation can be useful. More annoyingly, most standard library functions use arbitrary precision arithmetic and dynamic memory allocation during conversion of floating-point numbers.

Even on fully hosted C environments, it turns out that if you read numeric data from text files, the standard C library string-to-number conversions (strtod(), strtol(), scanf() and their variants) become the bottleneck, when you have enough (megabytes) of data. Then, too, "optimized" conversion functions can reduce the load times to a fraction of what they would be using standard C conversion functions.

Yep. scanf() being the absolute dog of them all.

And sometimes the "simplest" solution is actually also the most efficient. As I said before, the simple approach to count powers of ten, requiring "only" up to 9 iterations per digit with just a subtraction and an increment per iteration, will be faster on most targets that do not have a hardware multiplier (with which most optimizing compiler will implement the divide by 10), or on which the hardware multiplier is a multi-cycle operation. In particular, I've used the solution I mentioned earlier on AVR targets and it led to a lot smaller code size, and much faster too. Don't let the fact that it looks too "naive" deter you - looking at the generated assembly if in doubt will help picking the most efficient approach, in simple cases like this.

Picuino · « **Reply #24 on:** **Yesterday** at 11:49:49 am »

Performing the binary-to-decimal conversion with custom routines has helped me to be able to dump huge numbers (48 bits long) from the counter of an 8-bit microcontroller that counted 10MHz pulses for long periods of time and dumped the count by UART every second.


EEVblog Main Site	EEVblog on Youtube	EEVblog on Twitter	EEVblog on Facebook	EEVblog on Odysee

Author Topic: Routines to convert binary to BCD in C code (Read 2887 times)

Share me