For example, I've got this for AVR:
;
; uDivTen
;
; Unsigned 16-bit division by ten, done as a multiply by the scaled
; reciprocal K = 2^24 / 10 + 1 (= 0x19999A with integer division) instead
; of a divide loop. The 16 x 24 bit product is 40 bits wide, D[0..4]:
; its top two bytes D[4]:D[3] are the quotient, and the remainder is
; recovered from D[2], the top byte of the 24-bit fraction below them.
; Since K * 10 = 2^24 + 4, the product overshoots in/10 by only
; 0.4 * in / 2^24, which is too small to disturb the integer part.
;
; Unsigned division by ten. 45 words, takes 60 cycles (plus rcall).
; NOTE(review): the listing below contains 44 one-word instructions, and
; the cycle total depends on the core's push/pop/ret timing -- re-verify
; both figures before relying on them.
;
; Input:
; r17:r16 = operand
; Output:
; r1:r0 = quotient
; r3:r2 = remainder
; (on exit r16 = 10, r17 = 0: the operand registers are clobbered)
; r4..r7 are saved on the stack and restored before returning.
;
uDivTen:
push r7 ; register usage (r4..r7 are saved here, restored at the end):
push r6 ; r7:r6 = operand (in[0..1]; in[0] = r6, in[1] = r7)
push r5 ; r5:r4:r3:r2 = bytes D[4]:D[3]:D[2]:D[1] of the 40 bit product (byte D[0] discarded)
push r4 ; r1:r0 = partial products (result of each mul)
movw r6, r16 ; r16 = temp, r17 = zero
clr r17 ; (but first, save operand); r17 = 0 is the addend that ripples carries upward
; multiply by K = 2^20 * 16 / 10 (= 2^24 / 10, plus 1 to round up) to make
; D[0..4] = in[0..1] * K[0..2]
; (implicitly discarding D[0]: only the low byte of in[0] * K[0] would land
; there and nothing is ever added to it, so no carry is lost)
; Each add/adc chain below runs with no flag-changing instruction inside it;
; mul overwrites carry, so every chain starts with a plain add.
ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16 ; r16 = K[2] (bits 23..16 of K)
mul r7, r16 ; in[1] * K[2], weight 2^24
movw r4, r0 ; save high word: initialises D[3..4]
ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 0)) >> 0 ; r16 = K[0] (bits 7..0 of K)
mul r7, r16 ; in[1] * K[0], weight 2^8
movw r2, r0 ; save low word: initialises D[1..2]
mul r6, r16 ; in[0] * K[0], weight 2^0
add r2, r1 ; accumulate high byte to D[1] (discard lowest byte, r0)
adc r3, r17 ; propagate carry into D[2]
adc r4, r17 ; ... D[3]
adc r5, r17 ; ... D[4]
ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 8)) >> 8 ; r16 = K[1] (bits 15..8 of K)
mul r6, r16 ; in[0] * K[1], weight 2^8
add r2, r0 ; accumulate to D[1..2]
adc r3, r1
adc r4, r17 ; propagate carry into D[3]
adc r5, r17 ; ... D[4]
mul r7, r16 ; in[1] * K[1], weight 2^16
add r3, r0 ; accumulate to D[2..3]
adc r4, r1
adc r5, r17 ; propagate carry into D[4]
ldi r16, (((exp2(24) / 10) + 1) & (0x0000ff << 16)) >> 16 ; reload K[2] (r16 was reused for K[0] and K[1])
mul r6, r16 ; in[0] * K[2], weight 2^16
add r3, r0 ; accumulate to D[2..3]
adc r4, r1
adc r5, r17 ; propagate carry into D[4]; r5:r4 now holds the quotient
; dig remainder out of the fractional part: r3 = D[2] is the top byte of
; the fraction, roughly (remainder / 10) * 256 truncated
ldi r16, 0x10 ; rounding bit: +16/256 so the truncated fraction still yields the right digit
add r3, r16 ; carry out of r3 is ignored -- NOTE(review): relies on D[2] staying below 0xF0
ldi r16, 10
mul r3, r16 ; frac * 10; the high byte (r1) is the remainder digit
mov r2, r1 ; remainder low byte
clr r3 ; remainder high byte is always zero
movw r0, r4 ; quotient out (D[4]:D[3])
; r3 = 0, r2 = [0...9], r1:r0 = [0...6553]
pop r4 ; restore saved registers in reverse order of the pushes
pop r5
pop r6
pop r7
ret
; END PROC uDivTen
The downside is that it does a 24-bit intermediate operation.
Tim