and dumped the count by UART every second
That is one of the points I am making — speed of conversion does not matter at all if outputting only every second. Even when you want to dump lots of (formatted) data it matters very little.
Your original post was focused on the conversion speed, but if you had actually output all the data at your specified 9600 baud (960 bytes/sec) you would have found out that any formatting you can come up with will be blocking on the uart as it sends out the data, whether you used the arduino print which you had access to or your own custom bcd conversion.
Do what you desire, but using existing code for formatting such as printf or arduino print will be plenty fast. Code size will also be of little concern unless you have a 4k or less mcu. In the case of an avr, the printf code is about 1.5k and once brought in you can use it as much as you want (so use it for everything that needs formatting). For arduino (which I don't use) I imagine no one using that will concern themselves with how the details of formatting are done as they just use print when the need arises.
printf( "%lu\n", my10MHzCounter() ); //%lu for 32bits on avr
Serial.println( my10MHzCounter() );
In this project, the problem was to print 48-bit numbers (16 bits of the main uC counter + 32 bits of the carry counter) in decimal (15 decimal digits).
In that case the speed was not important, but to be able to do conversions of many bits. I solved it with custom subroutines for 48bit bit-shift and the algorithm I have previously published.
With a 32-bit microcontroller it is simple, but with an 8-bit microcontroller you have to do the job differently.
but with an 8-bit microcontroller you have to do the job differently.
Differently, but can still use the tools already in the toolbox.
uint64_t v = readCount();
uint32_t vH = v / 1000000000u;   // upper decimal digits (value / 10^9)
uint32_t vL = v % 1000000000u;   // lower 9 decimal digits
//skip leading 0's, avr lu is uint32_t
if( vH ) printf( "%lu%09lu\n", vH, vL ); else printf( "%lu\n", vL );
Here is a version that seems to be a bit faster for larger numbers:
/*
 * Convert a 32-bit unsigned value to a 10-character decimal string
 * (leading zeroes included) using the double-dabble (shift-and-add-3)
 * binary-to-BCD algorithm: shifts and adds only, no multiply/divide.
 *
 * s must have room for 11 bytes (10 digits + NUL terminator).
 *
 * Bug fix: the correction step for res4 tested res3 instead of res4,
 * so the most significant BCD byte was never corrected (e.g. the
 * value 1000000000 converted incorrectly).
 */
void bin32_to_str2(unsigned long int val, char * s)
{
unsigned char res0, res1, res2, res3, res4; /* packed BCD, res0 = least significant pair */
unsigned char j, cy1, cy2;
res0=res1=res2=res3=res4=0x00;
for(j=0; j<32; j++)
{
cy1=(val&0x80000000)?1:0; /* next binary bit, MSB first */
val<<=1;
/* Add-3 correction: any BCD nibble > 4 gets +3 before the shift */
if(res0>0x04)
{
if((res0&0x0f)>0x04) res0+=0x03;
if((res0&0xf0)>0x40) res0+=0x30;
}
if(res1>0x04)
{
if((res1&0x0f)>0x04) res1+=0x03;
if((res1&0xf0)>0x40) res1+=0x30;
}
if(res2>0x04)
{
if((res2&0x0f)>0x04) res2+=0x03;
if((res2&0xf0)>0x40) res2+=0x30;
}
if(res3>0x04)
{
if((res3&0x0f)>0x04) res3+=0x03;
if((res3&0xf0)>0x40) res3+=0x30;
}
if(res4>0x04) /* FIX: was res3, leaving res4 uncorrected */
{
if((res4&0x0f)>0x04) res4+=0x03;
if((res4&0xf0)>0x40) res4+=0x30; /* top digit of a uint32 is at most 4; never fires */
}
/* Shift the whole 40-bit BCD accumulator left one bit, pulling in cy1 */
cy2=(res0&0x80)?1:0;
res0<<=1;
res0|=cy1;
cy1=(res1&0x80)?1:0;
res1<<=1;
res1|=cy2;
cy2=(res2&0x80)?1:0;
res2<<=1;
res2|=cy1;
cy1=(res3&0x80)?1:0;
res3<<=1;
res3|=cy2;
res4<<=1;
res4|=cy1;
}
/* Unpack BCD nibbles into ASCII digits, most significant first */
s[0]=(res4>>4) | '0';
s[1]=(res4&0x0f)| '0';
s[2]=(res3>>4) | '0';
s[3]=(res3&0x0f)| '0';
s[4]=(res2>>4) | '0';
s[5]=(res2&0x0f)| '0';
s[6]=(res1>>4) | '0';
s[7]=(res1&0x0f)| '0';
s[8]=(res0>>4) | '0';
s[9]=(res0&0x0f)| '0';
s[10]=0;
}
Some quick tests:
Picuino bin32_to_str() ----------
T: 244.187500 us
Number: 3114455555
T: 121.000000 us
Number: 0000055555
T: 83.250000 us
Number: 0000000005
jesuscf bin32_to_str2() ----------
T: 160.624980 us
Number: 3114455555
T: 124.625000 us
Number: 0000055555
T: 113.937510 us
Number: 0000000005
My version uses 342 bytes, while Picuino's version uses only 164 bytes. The code was compiled with AVR-gcc and ran in a ATMega328p with a 16MHz crystal. I think Picuino's version can be made faster if it works with packed BCD numbers.
Here is a version that seems to be a bit faster for larger numbers:
Care to compare to the
u32_bcd(uint8_t bcd[5], uint32_t val) or
u32_str(char *dst, uint32_t val) below?
// SPDX-License-Identifier: CC0-1.0
#include <stdint.h>
/* 8 x powers of ten, 10^8 down to 10^0.  Starting each decimal
   position at eight times its power of ten lets the divisor be
   halved (8, 4, 2, 1) instead of reloaded, so every digit takes
   only shifts and compare/subtract steps.  The 10^9 position is
   handled separately in the functions below, because the top
   digit of a uint32_t can be at most 4. */
const uint32_t u32_bcd_bit[9] = {
800000000,
80000000,
8000000,
800000,
80000,
8000,
800,
80,
8,
};
/*
 * Convert val to packed BCD, most significant byte first:
 * dst[0..4] receive the ten decimal digits, two per byte.
 *
 * Repeated-subtraction conversion: b10 steps through 4e9, then
 * 8,4,2,1 times each lower power of ten (from u32_bcd_bit), so each
 * decimal digit is built from at most four compare/subtract steps.
 *
 * Fix: the end-of-table test compared address low bytes through
 * (uint8_t)(uintptr_t) casts, which is implementation-defined and
 * non-portable; a plain one-past-the-end pointer comparison is
 * equivalent and portable.
 */
void u32_bcd(uint8_t *dst, uint32_t val)
{
    const uint32_t *ptr = u32_bcd_bit;
    const uint32_t *const tbl_end =
        u32_bcd_bit + sizeof u32_bcd_bit / sizeof u32_bcd_bit[0];
    uint32_t b10 = 4000000000u;  /* 4e9: top digit of a uint32 is at most 4 */
    uint8_t bit = 0x40;
    uint8_t bcd = 0;
    while (1) {
        while (1) {
            if (val >= b10) {
                val -= b10;
                bcd |= bit;
            }
            bit >>= 1;
            if (!bit)
                break;
            if (bit == 0x08)
                b10 = *(ptr++);  /* reload 8*10^k for the low nibble's digit */
            else
                b10 >>= 1;
        }
        *(dst++) = bcd;
        if (ptr == tbl_end)
            break;
        b10 = *(ptr++);
        bit = 0x80;
        bcd = 0x00;
    }
}
/*
 * Convert val to a decimal string with leading zeroes suppressed,
 * matching snprintf(buf, 12, "%lu", val).  dst must have room for
 * 11 bytes (up to 10 digits + NUL).
 *
 * Same repeated-subtraction scheme as u32_bcd(): the divisor b10
 * steps through 4e9, then 8,4,2,1 times each lower power of ten.
 *
 * Fix: the end-of-table test compared address low bytes through
 * (uint8_t)(uintptr_t) casts, which is implementation-defined and
 * non-portable; a plain one-past-the-end pointer comparison is
 * equivalent and portable.
 */
void u32_str(uint8_t *dst, uint32_t val)
{
    const uint32_t *ptr = u32_bcd_bit;
    const uint32_t *const tbl_end =
        u32_bcd_bit + sizeof u32_bcd_bit / sizeof u32_bcd_bit[0];
    uint32_t b10 = 4000000000u;
    uint8_t bit = 4;   /* 10^9 digit is at most 4 */
    uint8_t dig = 0;
    while (1) {
        while (1) {
            if (val >= b10) {
                val -= b10;
                dig += bit;
            }
            bit >>= 1;
            if (!bit)
                break;
            b10 >>= 1;
        }
        /* dig stays 0 until the first significant digit; after that it
           is kept ASCII-biased ('0' + digit) so interior zeroes print. */
        if (dig) {
            if (dig < 10)
                dig += '0';
            *(dst++) = dig;
            dig = '0';
        }
        if (ptr == tbl_end)
            break;
        b10 = *(ptr++);
        bit = 8;
    }
    if (!dig)          /* val was 0: emit a single '0' */
        *(dst++) = '0';
    *dst = '\0';
}
The table takes 36 bytes of Flash,
u32_bcd() 132 bytes, and
u32_str() 130 bytes, for a total of 298 bytes using
avr-gcc-5.4.0 -Os -mmcu=atmega328p; 150 and 154 bytes if using
-O2. Note that
u32_str() omits leading zeroes, producing the same output as
snprintf(buf, 12, "%lu", val);. They have both been verified to work correctly for all possible
uint32_t values, too.
u32_bcd() returns the bcd digits in most significant byte first. You can use
/*
 * Same as u32_bcd() above, but stores the packed BCD least
 * significant byte first: dst[0] holds the two lowest decimal
 * digits.  dst must point to a 5-byte buffer.
 *
 * Fix: the end-of-table test compared address low bytes through
 * (uint8_t)(uintptr_t) casts, which is implementation-defined and
 * non-portable; a plain one-past-the-end pointer comparison is
 * equivalent and portable.
 */
void u32_bcd(uint8_t *dst, uint32_t val)
{
    const uint32_t *ptr = u32_bcd_bit;
    const uint32_t *const tbl_end =
        u32_bcd_bit + sizeof u32_bcd_bit / sizeof u32_bcd_bit[0];
    uint32_t b10 = 4000000000u;
    uint8_t bit = 0x40;
    uint8_t bcd = 0;
    dst += 5;                    /* fill the buffer backwards */
    while (1) {
        while (1) {
            if (val >= b10) {
                val -= b10;
                bcd |= bit;
            }
            bit >>= 1;
            if (!bit)
                break;
            if (bit == 0x08)
                b10 = *(ptr++);  /* reload 8*10^k for the low nibble's digit */
            else
                b10 >>= 1;
        }
        *(--dst) = bcd;
        if (ptr == tbl_end)
            break;
        b10 = *(ptr++);
        bit = 0x80;
        bcd = 0x00;
    }
}
for least significant byte first.
I don't think these are the fastest ones, especially for smaller numbers (because they do not short-circuit large numbers and always do the same number of iterations), but I found the approach interesting, and they might be useful if one needs both BCD and no-leading-zeroes decimal conversion.
Care to compare to the u32_bcd(uint8_t bcd[5], uint32_t val) or u32_str(char *dst, uint32_t val) below?
Here it is:
Picuino bin32_to_str() ----------
T: 235.00us
Number: 3114455555
T: 118.44us
Number: 0000055555
T: 82.69us
Number: 0000000005
jesuscf bin32_to_str2() ----------
T: 163.44us
Number: 3114455555
T: 89.25us
Number: 0000055555
T: 45.25us
Number: 0000000005
Nominal Animal u32_str() ----------
T: 52.75us
Number: 3114455555
T: 49.06us
Number: 55555
T: 45.31us
Number: 5
I slightly improved the speed of my function, but
Nominal Animal's function is clearly faster! Not only faster but much smaller as well!
For reference, this is what I did:
/*
 * Convert a 32-bit unsigned value to a 10-character decimal string
 * (leading zeroes included) via double-dabble, first skipping the
 * leading zero bits of val so small values convert faster.
 *
 * s must have room for 11 bytes (10 digits + NUL terminator).
 *
 * Fix: the locals were declared static, which made the function
 * non-reentrant and permanently consumed RAM; automatic locals
 * behave identically for every call.
 */
void bin32_to_str2(unsigned long int val, char * s)
{
unsigned char res0, res1, res2, res3, res4; /* packed BCD, res0 = least significant pair */
unsigned char j, cy1, cy2;
res0=res1=res2=res3=res4=0x00;
/* Skip leading zero bits: the BCD accumulator stays 0 for them anyway */
for(j=0; j<32; j++)
{
if (val&0x80000000) break;
val<<=1;
}
for(; j<32; j++)
{
cy1=(val&0x80000000)?1:0;
val<<=1;
/* Add-3 correction on every BCD nibble > 4 before shifting */
if(res0>0x04)
{
if((res0&0x0f)>0x04) res0+=0x03;
if((res0&0xf0)>0x40) res0+=0x30;
}
if(res1>0x04)
{
if((res1&0x0f)>0x04) res1+=0x03;
if((res1&0xf0)>0x40) res1+=0x30;
}
if(res2>0x04)
{
if((res2&0x0f)>0x04) res2+=0x03;
if((res2&0xf0)>0x40) res2+=0x30;
}
if(res3>0x04)
{
if((res3&0x0f)>0x04) res3+=0x03;
if((res3&0xf0)>0x40) res3+=0x30;
}
if(res4>0x04)
{
if((res4&0x0f)>0x04) res4+=0x03;
//if((res4&0xf0)>0x40) res4+=0x30; // Never larger than 0x40
}
/* Shift the whole 40-bit BCD accumulator left one bit, pulling in cy1 */
cy2=(res0&0x80)?1:0;
res0=(res0<<1)|cy1;
cy1=(res1&0x80)?1:0;
res1=(res1<<1)|cy2;
cy2=(res2&0x80)?1:0;
res2=(res2<<1)|cy1;
cy1=(res3&0x80)?1:0;
res3=(res3<<1)|cy2;
res4<<=1;
res4|=cy1;
}
/* Unpack BCD nibbles into ASCII digits, most significant first */
s[0]=(res4>>4) | '0';
s[1]=(res4&0x0f)| '0';
s[2]=(res3>>4) | '0';
s[3]=(res3&0x0f)| '0';
s[4]=(res2>>4) | '0';
s[5]=(res2&0x0f)| '0';
s[6]=(res1>>4) | '0';
s[7]=(res1&0x0f)| '0';
s[8]=(res0>>4) | '0';
s[9]=(res0&0x0f)| '0';
s[10]=0;
}
I'm surprised it did that well; but having roughly the same duration regardless of the value converted was expected.
Anyway, the idea behind u32_bcd() and u32_str() is the good ol' repeated subtraction.
I simply realized that the AVR instruction set is such that it can shift the 32-bit value down by one bit in four cycles; and if we start with a power of ten multiplied by 8, we can shift it down in four steps and cover all possible values at that decimal position. So, we only need eight multiplied by all powers of ten. The most significant position cannot support an eight, but it can a four, so the loop starts with that special value.
It also happens that decimal digits 0 through 9 have at most three bits set, so each decimal position simplifies to at most three 32-bit subtractions (but four shifts of the 32-bit value). The worst case value for both functions is 3777777777 = 0xe12c5071, which does the most 32-bit subtractions (29).
In essence, it is one of those "funky" combinations of powers of two and powers of ten logic.
On a totally related question: how do you guys find out the size of a function in bytes? I was checking the .map file generated by the linker, but it only gives the start of the function, not the size. Based on the map file, I think I calculated the size of my function incorrectly. Now I am trying to find the size of a function by doing something like this in the code:
void u32_str(uint8_t *dst, uint32_t val)
{
.
.
.
lbl_end:
sub_start=((unsigned int)u32_str)*2;
sub_end=((unsigned int)&&lbl_end)*2;
}
Then I print (sub_end-sub_start) where sub_start and sub_end are global variables. Does it make sense?
I managed to improve a little bit my function, but not near Nominal Animal's u32_str() yet:
Picuino bin32_to_str() ----------
T: 234.31us
Number: 3114455555
T: 117.75us
Number: 0000055555
T: 82.00us
Number: 0000000005
Start: 0x0222, End: 0x02be, Bytes: 156
jesuscf bin32_to_str2() ----------
T: 124.88us
Number: 3114455555
T: 63.69us
Number: 0000055555
T: 30.25us
Number: 0000000005
Start: 0x03b8, End: 0x0444, Bytes: 140
Nominal Animal u32_str() ----------
T: 51.88us
Number: 3114455555
T: 48.19us
Number: 55555
T: 44.44us
Number: 5
Start: 0x02fe, End: 0x037a, Bytes: 124
And for reference here is my slightly improved function:
/*
 * Double-dabble 32-bit binary to 10-digit ASCII decimal (leading
 * zeroes kept), with a pre-loop that skips leading zero bits so
 * small values convert faster.  s needs room for 11 bytes.
 *
 * The code after lbl_end records this function's code range into
 * the globals sub_start/sub_end (defined elsewhere) to measure the
 * function size in bytes; && is the GCC "labels as values"
 * extension, and the *2 converts AVR word addresses to byte
 * addresses — presumably specific to avr-gcc; verify on other
 * targets.
 */
void bin32_to_str2(char * s, unsigned long int val)
{
unsigned char res0, res1, res2, res3, res4;
unsigned char j;
res0=res1=res2=res3=res4=0x00;
/* Skip leading zero bits of val; the BCD accumulator stays 0 for them */
for(j=0; j<32; j++)
{
if (val&0x80000000) break;
val<<=1;
}
for(; j<32; j++)
{
/* Add-3 correction of every BCD nibble > 4 before shifting */
if(res0>0x04)
{
if((res0&0x0f)>0x04) res0+=0x03;
if((res0&0xf0)>0x40) res0+=0x30;
}
if(res1>0x04)
{
if((res1&0x0f)>0x04) res1+=0x03;
if((res1&0xf0)>0x40) res1+=0x30;
}
if(res2>0x04)
{
if((res2&0x0f)>0x04) res2+=0x03;
if((res2&0xf0)>0x40) res2+=0x30;
}
if(res3>0x04)
{
if((res3&0x0f)>0x04) res3+=0x03;
if((res3&0xf0)>0x40) res3+=0x30;
}
if(res4>0x04)
{
if((res4&0x0f)>0x04) res4+=0x03;
//if((res4&0xf0)>0x40) res4+=0x30; // Never larger than 0x40
}
/* Shift the 40-bit BCD accumulator left, pulling in the next bit of val */
res4<<=1;
if(res3&0x80) res4|=1;
res3<<=1;
if(res2&0x80) res3|=1;
res2<<=1;
if(res1&0x80) res2|=1;
res1<<=1;
if(res0&0x80) res1|=1;
res0<<=1;
if (val&0x80000000) res0|=1;
val<<=1;
}
/* Unpack BCD nibbles to ASCII digits, most significant first */
s[0]=(res4>>4) | '0';
s[1]=(res4&0x0f)| '0';
s[2]=(res3>>4) | '0';
s[3]=(res3&0x0f)| '0';
s[4]=(res2>>4) | '0';
s[5]=(res2&0x0f)| '0';
s[6]=(res1>>4) | '0';
s[7]=(res1&0x0f)| '0';
s[8]=(res0>>4) | '0';
s[9]=(res0&0x0f)| '0';
s[10]=0;
lbl_end:
sub_start=((unsigned int)bin32_to_str2)*2;
sub_end=((unsigned int)&&lbl_end)*2;
}
On a totally related question: how do you guys find out the size of a function in bytes?
Find the object file, or compile your say
stuff.c to an object file using e.g.
avr-gcc -Wall -Os -mmcu=atmega328p -c stuff.c and then look at the symbol sizes in the symbol table:
avr-objdump -t stuff.o. The size is the second hexadecimal number just to the left of the symbol name at the extreme right. For example, on the code I posted, the output is
8.o: file format elf32-avr
SYMBOL TABLE:
00000000 l df *ABS* 00000000 stuff.c
00000000 l d .text 00000000 .text
00000000 l d .data 00000000 .data
00000000 l d .bss 00000000 .bss
0000003e l *ABS* 00000000 __SP_H__
0000003d l *ABS* 00000000 __SP_L__
0000003f l *ABS* 00000000 __SREG__
00000000 l *ABS* 00000000 __tmp_reg__
00000001 l *ABS* 00000000 __zero_reg__
00000000 l d .rodata 00000000 .rodata
00000000 l d .comment 00000000 .comment
00000000 g F .text 0000008c u32_bcd
00000000 g O .rodata 00000024 u32_bcd_bit
0000008c g F .text 00000088 u32_str
00000000 *UND* 00000000 __do_copy_data
which tells us that
u32_bcd Function is 0x8c = 140 bytes long,
u32_str is 0x88 = 136 bytes long, and the read-only
u32_bcd_bit data
Object is 0x24 = 36 bytes long.
If you don't like hexadecimals, you can use
avr-readelf -s stuff.o instead, which outputs
Symbol table '.symtab' contains 16 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 00000000 0 NOTYPE LOCAL DEFAULT UND
1: 00000000 0 FILE LOCAL DEFAULT ABS stuff.c
2: 00000000 0 SECTION LOCAL DEFAULT 1
3: 00000000 0 SECTION LOCAL DEFAULT 3
4: 00000000 0 SECTION LOCAL DEFAULT 4
5: 0000003e 0 NOTYPE LOCAL DEFAULT ABS __SP_H__
6: 0000003d 0 NOTYPE LOCAL DEFAULT ABS __SP_L__
7: 0000003f 0 NOTYPE LOCAL DEFAULT ABS __SREG__
8: 00000000 0 NOTYPE LOCAL DEFAULT ABS __tmp_reg__
9: 00000001 0 NOTYPE LOCAL DEFAULT ABS __zero_reg__
10: 00000000 0 SECTION LOCAL DEFAULT 5
11: 00000000 0 SECTION LOCAL DEFAULT 6
12: 00000000 140 FUNC GLOBAL DEFAULT 1 u32_bcd
13: 00000000 36 OBJECT GLOBAL DEFAULT 5 u32_bcd_bit
14: 0000008c 136 FUNC GLOBAL DEFAULT 1 u32_str
15: 00000000 0 NOTYPE GLOBAL DEFAULT UND __do_copy_data
If you wonder why one has two different tools,
readelf and
objdump, in the same toolchain, the main reason is that
objdump uses the GNU Binary File Descriptor library (BFD), as does e.g. the GNU
as assembler; but
readelf parses ELF files directly. If their output ever disagrees, then it is a bug in the BFD library, and the differences will help pinpoint the bug it.
It is worth comparing the proposed solutions with ultoa() from stdlib:
stdlib ultoa() ----------
T: 223.44us
Number: 3114455555
T: 101.00us
Number: 55555
T: 4.75us
Number: 5
66: 000013be 68 FUNC GLOBAL DEFAULT 2 __ultoa_ncheck
80: 000014e4 188 FUNC GLOBAL DEFAULT 2 __ultoa_invert
104: 000013c0 0 NOTYPE GLOBAL DEFAULT 2 __ultoa_common
EDIT: the first version produced digits in the opposite order, i.e. least significant digit leftmost, not rightmost. Fixed.
ultoa() etc. are typically based on a divide-by-ten (or divide-by-radix) approach, i.e.
/* Digit characters for radices 2 through 36. */
const char digit[] = "0123456789abcdefghijklmnopqrstuvwxyz";
/*
 * Classic divide-by-radix conversion of an unsigned long to a string.
 * radix must be in 2..36.  Returns a pointer to the terminating NUL.
 *
 * Fix: the swap temporaries were declared char * but hold char
 * values; they are now plain char (the original did not compile
 * cleanly and stored through misused pointers).
 */
char *ultoa(unsigned long val, char *dst, int radix)
{
    char *d = dst;
    /* Emit digits least significant first. */
    do {
        *(d++) = digit[val % radix];
        val /= radix;
    } while (val);
    *d = '\0';
    /* Reverse into most-significant-first order. */
    {
        char *b = dst;
        char *q = d;
        while (b < q) {
            const char cb = *b;
            const char cq = *(--q);
            *(b++) = cq;
            *q = cb;
        }
    }
    return d;
}
Here are equivalent 32-bit unsigned and signed integer versions, using a 32-bit:8-bit division with remainder (instead of the ABI __udivmodsi4 32-bit:32-bit division with remainder):
#include <stdint.h>
/* Quotient/remainder pair for one byte of a long division. */
typedef struct {
uint8_t q;
uint8_t r;
} div8bit;

/*
 * One byte's worth of restoring binary division: divide the value
 * formed by the incoming remainder r (from the higher byte) and the
 * bits of v at and below mask b, by d.  Returns the byte quotient
 * and the remainder to carry into the next lower byte.
 */
div8bit divmod8bit_loop(uint8_t r, uint8_t b, const uint8_t v, const uint8_t d)
{
uint8_t quot = 0;
for (; b; b >>= 1) {
r = (uint8_t)((r << 1) | ((v & b) ? 1 : 0));
if (r >= d) {
r -= d;
quot |= b;
}
}
return (div8bit){ .q = quot, .r = r };
}
/* Return the highest set bit of v as a single-bit mask.
   v must be nonzero (the caller guarantees this); otherwise
   the scan would never terminate. */
static uint8_t highbit(uint8_t v)
{
uint8_t mask;
for (mask = 0x80; !(mask & v); mask >>= 1)
;
return mask;
}
// Divide *vptr by d, d >= 1, d < 128. Returns remainder.
/*
 * 32-bit by 8-bit division with remainder, processed one byte at a
 * time via divmod8bit_loop().  The quotient replaces *vptr.
 * Because d < 128, the running remainder always fits in 8 bits.
 * The goto chain merely skips the leading zero bytes of val and
 * then falls through the remaining byte stages in order.
 */
uint8_t divmod8bit(uint32_t *const vptr, const uint8_t d)
{
const uint32_t val = *vptr;
if (!val)
return 0;
uint32_t q = 0;
uint8_t b, w;
div8bit t = { .q = 0, .r = 0 };
/* Find the highest nonzero byte and start the division there */
if ((w = (uint8_t)(val >> 24))) {
b = highbit(w);
goto b3;
} else
if ((w = (uint8_t)(val >> 16))) {
b = highbit(w);
goto b2;
} else
if ((w = (uint8_t)(val >> 8))) {
b = highbit(w);
goto b1;
} else {
b = highbit((uint8_t)(val));
goto b0;
}
b3: /* each stage passes its remainder t.r down to the next byte */
t = divmod8bit_loop(t.r, b, (uint8_t)(val >> 24), d);
q |= (uint32_t)(t.q) << 24;
b = 0x80;
b2:
t = divmod8bit_loop(t.r, b, (uint8_t)(val >> 16), d);
q |= (uint32_t)(t.q) << 16;
b = 0x80;
b1:
t = divmod8bit_loop(t.r, b, (uint8_t)(val >> 8), d);
q |= (uint32_t)(t.q) << 8;
b = 0x80;
b0:
t = divmod8bit_loop(t.r, b, (uint8_t)(val), d);
q |= (uint32_t)(t.q);
*vptr = q;
return t.r;
}
/*
 * Convert val to a string in the given radix, using the radix
 * character set in base (e.g. "0123456789" for decimal).
 * Uses divmod8bit() instead of the 32:32 ABI division helper.
 * Returns a pointer to the terminating NUL.
 */
char *u32_str(char *dst, uint32_t val, const char *base, uint8_t radix)
{
uint32_t rem = val;
char *out = dst;
uint8_t count = 0;
/* Emit digits least significant first, counting them. */
do {
*(out++) = base[divmod8bit(&rem, radix)];
count++;
} while (rem);
*out = '\0';
/* Reverse in place: swap count/2 pairs from both ends. */
{
char *lo = dst;
char *hi = out;
for (count >>= 1; count; count--) {
const char tmp = *lo;
*(lo++) = *(--hi);
*hi = tmp;
}
}
return out;
}
/*
 * Signed variant of u32_str(): prefixes a '-' for negative values.
 * Fix: the magnitude is now computed as -(uint32_t)val, because
 * -val overflows (undefined behavior) when val == INT32_MIN.
 */
char *i32_str(char *dst, int32_t val, const char *base, uint8_t radix)
{
    if (val < 0) {
        *(dst++) = '-';
        return u32_str(dst, -(uint32_t)val, base, radix);
    } else
        return u32_str(dst, (uint32_t)val, base, radix);
}
u32_str(dst, value, "0123456789", 10) has been verified to produce correct results for all 32-bit unsigned integer values.
In addition to the radix characters string, divmod8bit() and divmod8bit_loop() take 346 bytes, u32_str() 194 bytes, and i32_str() additional 38 bytes, for a total of 572 bytes, using avr-gcc-5.4.0 -Os -mmcu=atmega328p.
Standard 32-bit:32-bit unsigned integer binary division with remainder can be written as
#include <stdint.h>
/* Quotient/remainder pair for 32-bit division. */
typedef struct {
uint32_t quotient;
uint32_t remainder;
} u32_qr;

/*
 * Standard 32:32 unsigned binary division with remainder.
 * Division by zero yields { 0, 0 }; n < d yields { 0, n }.
 */
u32_qr u32_divmod(uint32_t n, uint32_t d)
{
u32_qr res = { .quotient = 0, .remainder = 0 };
/* Division by zero: report {0, 0}. */
if (!d)
return res;
/* Covers n == 0 and every other n < d case. */
if (n < d) {
res.remainder = n;
return res;
}
/* Locate the highest set bit of the numerator. */
uint32_t mask = (uint32_t)1 << 31;
while (!(mask & n))
mask >>= 1;
/* Restoring division, one numerator bit per step. */
for (; mask; mask >>= 1) {
res.remainder = (res.remainder << 1) | ((n & mask) != 0);
if (res.remainder >= d) {
res.remainder -= d;
res.quotient |= mask;
}
}
return res;
}
Because remainder is at most one bit larger than divisor, and the divisor in these radix calculations is always less than 128, the AVR version of divmod8bit() splits the calculation into four steps, each step working on a single byte. It should speed up the division, without increasing the function size much.