Here is a proper suggestion for ATtiny85, for a 32-bit TIMER0 counter:
#include <stdint.h>
/*
 * C-side view of the TIMER0 virtual counter implemented in asm-timer0.s.
 *
 * The two objects below are DEFINED in asm-timer0.s (as common symbols), so
 * they are declared `extern` here.  Declaring them `static` would create
 * separate, file-private objects that the interrupt handler never touches.
 *
 * avr_timer_updates[1] is bumped by the overflow interrupt before it changes
 * the counter and avr_timer_updates[0] is set to the same value afterwards;
 * the accessor functions use the pair to detect an interrupted read.
 * Reading avr_timer_counter directly from C bypasses that check - prefer the
 * accessor functions.
 */
extern volatile uint8_t avr_timer_updates[2];
extern volatile uint32_t avr_timer_counter;

/* Counter value plus the current TCNT0 reading. */
extern uint32_t get_timer(void);

/* Counter value only (no TCNT0 added); cheaper but coarser. */
extern uint32_t get_timer_coarse(void);
with the TIMER0 overflow interrupt, get_timer(), and get_timer_coarse() functions implemented in assembly in asm-timer0.s:
.file "asm-timer0.s"
; SPDX-License-Identifier: CC0-1.0
;
; Syntax: GNU as for AVR (avr-as), assembled through avr-gcc.
;
; Symbolic names in the same spelling avr-gcc uses in its own output.
; The first three are I/O-space addresses (for in/out); the last two are
; register numbers.  __SP_H__, __SP_L__ and __tmp_reg__ are defined for
; completeness only; the code in this file uses __SREG__ and __zero_reg__.
.equ __SP_H__, 0x3e             ; stack pointer, high byte
.equ __SP_L__, 0x3d             ; stack pointer, low byte
.equ __SREG__, 0x3f             ; status register
.equ __tmp_reg__, 0             ; r0: compiler scratch register
.equ __zero_reg__, 1            ; r1: holds zero in compiled C code

.text
;
; timer0 overflow interrupt vector (variable-duration variant)
;
; On every TIMER0 overflow interrupt:
;   1. avr_timer_updates[1] is incremented           ("update started")
;   2. the current OCR0A value is added to the 32-bit avr_timer_counter
;   3. avr_timer_updates[0] is set to the new avr_timer_updates[1]
;                                                     ("update finished")
; Readers compare the two guard bytes to detect that this handler ran
; while they were copying the counter.
;
; The carry into the upper counter bytes is propagated with early exits,
; so the handler executes 25, 29, 33 or 36 instructions depending on how
; far the carry ripples.
;
; Registers: r0, r1, r18, r19, r20 and SREG are saved and restored.
; Stack:     six bytes pushed by this handler.
;
; NOTE(review): the amount added per overflow is OCR0A itself; whether that
; equals the real overflow period depends on the waveform generation mode
; configured elsewhere - confirm against the timer setup code.
;
.global __vector_5
.type __vector_5, @function
__vector_5:
        ; ISR prolog (same shape as avr-gcc emits for an ISR)
        push    r1
        push    r0
        in      r0, __SREG__
        push    r0                      ; saved SREG
        clr     __zero_reg__            ; part of the standard prolog; not
                                        ; otherwise used in this variant
        push    r20
        push    r19
        push    r18

        ; Mark "update in progress": bump the second guard byte first.
        lds     r20, avr_timer_updates+1
        inc     r20
        sts     avr_timer_updates+1, r20

        ; counter byte 0 += OCR0A
        in      r19, 0x29               ; r19 = OCR0A
        lds     r18, avr_timer_counter+0
        add     r18, r19
        sts     avr_timer_counter+0, r18 ; sts leaves the carry from add intact
        brcc    .Ldone                  ; no carry: upper bytes unchanged

        ; Ripple the carry upwards; stop as soon as a byte does not wrap
        ; to zero (inc sets Z, sts does not disturb it).
        lds     r18, avr_timer_counter+1
        inc     r18
        sts     avr_timer_counter+1, r18
        brne    .Ldone

        lds     r18, avr_timer_counter+2
        inc     r18
        sts     avr_timer_counter+2, r18
        brne    .Ldone

        lds     r18, avr_timer_counter+3
        inc     r18
        sts     avr_timer_counter+3, r18

.Ldone:
        ; Mark "update finished": first guard byte now equals the second.
        sts     avr_timer_updates+0, r20

        pop     r18
        pop     r19
        pop     r20
        ; ISR epilog
        pop     r0
        out     __SREG__, r0            ; restore saved SREG
        pop     r0
        pop     r1
        reti
.size __vector_5, .-__vector_5
;-----------------------------------------------------------------------
; uint32_t get_timer(void)
;
; Returns avr_timer_counter + TCNT0 as a 32-bit value in r25:r24:r23:r22.
;
; The snapshot is retried from the top when either
;   - TOV0 is set in TIFR (an overflow is pending, so the TCNT0 reading
;     and the stored counter do not belong together), or
;   - the two avr_timer_updates guard bytes differ (the overflow handler
;     ran while the counter bytes were being copied).
; If interrupts occur too often this can spin for a long time.
;
; Uses:  r18 (TCNT0), r19 (TIFR), r20/r21 (guard bytes), SREG flags.
; Needs: __zero_reg__ (r1) == 0, as in compiled C code.
;-----------------------------------------------------------------------
.equ GET_TIMER_IO_TCNT0, 0x32           ; I/O address of TCNT0
.equ GET_TIMER_IO_TIFR, 0x38            ; I/O address of TIFR
.equ GET_TIMER_TOV0_BIT, 1              ; TOV0 bit number within TIFR

.global get_timer
.type get_timer, @function
get_timer:
.Lget_timer_retry:
        ; Guard byte written LAST by the overflow handler is read FIRST.
        lds     r21, avr_timer_updates+0

        ; Copy the 32-bit counter, low byte first, into the return registers.
        lds     r22, avr_timer_counter+0
        lds     r23, avr_timer_counter+1
        lds     r24, avr_timer_counter+2
        lds     r25, avr_timer_counter+3

        ; Sample the hardware counter, then the overflow flag (in this order).
        in      r18, GET_TIMER_IO_TCNT0 ; r18 = TCNT0
        in      r19, GET_TIMER_IO_TIFR  ; r19 = TIFR
        sbrc    r19, GET_TIMER_TOV0_BIT ; skip the retry when TOV0 is clear
        rjmp    .Lget_timer_retry

        ; Guard byte written FIRST by the overflow handler is read LAST.
        lds     r20, avr_timer_updates+1
        cp      r20, r21
        brne    .Lget_timer_retry       ; handler ran during the copy

        ; result = counter + TCNT0, carry rippled through the upper bytes
        add     r22, r18
        adc     r23, __zero_reg__
        adc     r24, __zero_reg__
        adc     r25, __zero_reg__
        ret
.size get_timer, .-get_timer
;-----------------------------------------------------------------------
; uint32_t get_timer_coarse(void)
;
; Returns a consistent copy of avr_timer_counter in r25:r24:r23:r22,
; WITHOUT adding TCNT0 and without looking at the TOV0 flag.
;
; The copy is retried when the two avr_timer_updates guard bytes differ,
; i.e. when the overflow handler ran while the counter was being read.
; The retry restarts THIS function, so the result is always the coarse
; value (restarting in get_timer would return the fine value instead).
;
; Uses: r20/r21 (guard bytes).  Does not modify SREG flags.
;-----------------------------------------------------------------------
.global get_timer_coarse
.type get_timer_coarse, @function
get_timer_coarse:
        ; Guard byte written LAST by the overflow handler is read FIRST.
        lds     r21, avr_timer_updates+0

        ; Copy the 32-bit counter, low byte first, into the return registers.
        lds     r22, avr_timer_counter+0
        lds     r23, avr_timer_counter+1
        lds     r24, avr_timer_counter+2
        lds     r25, avr_timer_counter+3

        ; Guard byte written FIRST by the overflow handler is read LAST.
        lds     r20, avr_timer_updates+1
        cpse    r20, r21                ; equal: skip the retry and return
        rjmp    get_timer_coarse        ; handler ran during the copy: redo
        ret
.size get_timer_coarse, .-get_timer_coarse
; Storage shared by the overflow handler and the reader functions,
; defined as common symbols (size in bytes, alignment 1):
;   avr_timer_counter  - 32-bit running sum, low byte at offset 0
;   avr_timer_updates  - two guard bytes; [1] is bumped before the counter
;                        is changed, [0] is set to the same value afterwards
.comm avr_timer_counter,4,1
.comm avr_timer_updates,2,1

; The reader functions spin while the two guard bytes differ, so both must
; start out equal.  Referencing __do_clear_bss asks the linker to include
; the startup routine that zeroes .bss (where common symbols end up) even
; when no compiled C file in the program references it.
.global __do_clear_bss
Just feed that asm-timer0.s file to avr-gcc as if it was a C file. Not tested on actual ATtiny85 hardware, but it does compile using old avr-gcc-4.9.2 (avr-gcc-4.9.2 -Wall -mmcu=attiny85 -c asm-timer0.s), and the logic is sound, but do beware of bugs.
The idea is that whenever an overflow interrupt occurs, the avr_timer_counter value is incremented by OCR0A. The second of the avr_timer_updates[2] bytes is incremented before the counter is incremented, and the first after the counter is incremented, so that readers can spin if an interrupt occurs.
The get_timer() function adds TCNT0 to the counter value, so the result is essentially the 32-bit TIMER0 virtual counter. It uses both the avr_timer_updates[2] guard bytes, and TOV0 bit in TIFR to detect if the combined 32-bit counter is valid. If interrupts occur too often, it might spin forever; so test before use.
The get_timer_coarse() function omits the TCNT0 and TOV0 bit checks, and so is more lightweight, although the value is coarser. Although the timers are derived from the same source, you should not mix the values, unless you are prepared for get_timer_coarse() < get_timer() even if obtained at the very same moment somehow.
The timer ISR itself is a bit tricky: a fraction (256 − OCR0A)/256 of the ticks, namely those where adding OCR0A to the lowest counter byte does not carry, take only 25 instructions (I didn't bother to calculate cycle counts), and the ISR uses only six bytes of stack (not counting the return address). In the other cases, it takes 29, 33, or 36 instructions. If the jitter of a variable-duration TIMER0 ISR is problematic, the code can be changed to a fixed 33 instructions instead (no jumps nor conditional jumps), using
;
; timer0 overflow interrupt vector (fixed-duration variant)
;
; Same effect as the branching variant: bump avr_timer_updates[1], add the
; current OCR0A value to the 32-bit avr_timer_counter, then copy the new
; guard value into avr_timer_updates[0].  The carry is propagated through
; all four counter bytes unconditionally with adc, so there are no jumps
; and every invocation executes the same 33 instructions.
;
; Saves and restores r0, r1, r18, r19, r20 and SREG; pushes six bytes.
;
.global __vector_5
.type __vector_5, @function
__vector_5:
; ISR prolog
push r1
push r0
in r0, __SREG__
; r0 now holds the interrupted code's SREG; it is saved on the stack
push r0
; __zero_reg__ must be zero for the adc chain below
clr __zero_reg__
push r20
push r19
push r18
; mark "update in progress": increment the second guard byte first
lds r20, avr_timer_updates+1
inc r20
sts avr_timer_updates+1, r20
in r19, 0x29 ; r19 = OCR0A
; counter byte 0 += OCR0A; this add produces the carry used below
lds r18, avr_timer_counter+0
add r18, r19
sts avr_timer_counter+0, r18
; bytes 1..3 += carry; lds and sts leave the carry flag untouched, so a
; single register can be reused for every byte
lds r18, avr_timer_counter+1
adc r18, __zero_reg__
sts avr_timer_counter+1, r18
lds r18, avr_timer_counter+2
adc r18, __zero_reg__
sts avr_timer_counter+2, r18
lds r18, avr_timer_counter+3
adc r18, __zero_reg__
sts avr_timer_counter+3, r18
; mark "update finished": first guard byte now equals the second
sts avr_timer_updates+0, r20
pop r18
pop r19
pop r20
; ISR epilog
pop r0
; restore the interrupted code's SREG
out __SREG__, r0
pop r0
pop r1
reti
.size __vector_5, .-__vector_5
I have no idea which one performs better in practice.
Note that the lds+adc+sts pattern, which uses only one register to update all bytes of a multibyte integer, works because neither lds nor sts modifies the carry flag; only adc does. I think avr-gcc uses N registers for N-byte integers because that way the externally visible change occurs in one short window, shortening race windows. In my case, using the two generation/update counters and spinning until they match avoids any need for that. In an ISR, the single-register pattern is useful because it lessens the amount of stack used. Probably could reduce the stack use even more, but I just grabbed the ISR prolog and epilog from what avr-gcc generates for ISR(TIMER0_OVF_vect) { ... } when using #include <avr/interrupt.h>.