What is the fastest (in clock cycles) division algorithm that will run on an ATMEGA1284?
- The dividend is an unsigned 16-bit number passed into the algorithm in a pair of 8-bit registers.
- The divisor is an unsigned 16-bit number passed into the algorithm in a pair of 8-bit registers.
- The quotient and remainder are passed out in any pairs of 8-bit registers.
- The algorithm code (plus any lookup tables) must consume less than 5K bytes of flash memory.
- The algorithm may return any values for division by 0.
AVR Instruction Set Manual:
https://ww1.microchip.com/downloads/en/devicedoc/atmel-0856-avr-instruction-set-manual.pdf
AVR200: Multiply and Divide Routines:
https://ww1.microchip.com/downloads/en/AppNotes/doc0936.pdf
The algorithm I have so far takes between 57 and 68 clock cycles and uses 768 bytes of lookup tables. I ran an exhaustive 16-hour test that verified that it calculates the correct quotient and remainder for all 2^32 combinations of dividend and divisor.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;ARGUMENTS: r16, r17, r18, r19
; r16:r17 = N (numerator)
; r18:r19 = D (divisor)
;RETURNS: r20, r21
; r20:r21 (quotient)
; r22:r23 (remainder)
;
;DESCRIPTION: divides an unsigned 16 bit number N by unsigned 16 bit divisor D
;
;ALGORITHM OVERVIEW
;
;RZERO = 0;
;if(D < 256){
; N2 = (N * ((R1H_TBL[D] << 8) + R1L_TBL[D])) >> 16;
; P = N2 * D
;}else{
; D1 = (R1H_TBL[D] * D) >> 8
; N1 = (R1H_TBL[D] * N) >> 8
; if(D1 < 256){
; N2 = N1 >> 8;
; }else{
; N2 = N2 * R2_TBL[D1 & 0xFF];
; }
; P = N2 * D;
; if(P > 65535){
; N2 = N2 - 1 ;//Decrement quotient
; N1 = N2 - P + D;//Calculate remainder
; return;//return quotient in N2, remainder in N1
; }
;}
;N1 = N - P;
;if(P > N){
; N2 = N2 - 1;//decrease quotient
; N1 = N1 + D;//increase reamainder
; return;//return quotient in N2, remainder in N1
;}
;if(N1 > D){
; N2 = N2 + 1;
; N1 = N1 - D;
; return;//return quotient in N2, remainder in N1
;}
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.align 256
;Recipricol table #1, high byte.
;R1H_TBL[x] = min( high(2^16/x) / 256 , 255)
R1H_TBL:
.db 0xFF, 0xFF, 0x80, 0x55, 0x40, 0x33, 0x2A, 0x24, 0x20, 0x1C, 0x19, 0x17, 0x15, 0x13, 0x12, 0x11
.db 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0C, 0x0B, 0x0B, 0x0A, 0x0A, 0x09, 0x09, 0x09, 0x08, 0x08, 0x08
.db 0x08, 0x07, 0x07, 0x07, 0x07, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x05, 0x05, 0x05, 0x05, 0x05
.db 0x05, 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04
.db 0x04, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
.db 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
.db 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
.db 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
.db 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
.db 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
;Recipricol table #1, low byte.
;R1L_TBL[x] = min( low(2^16/x) mod 256 , 255)
R1L_TBL:
.db 0xFF, 0xFF, 0x00, 0x55, 0x00, 0x33, 0xAA, 0x92, 0x00, 0x71, 0x99, 0x45, 0x55, 0xB1, 0x49, 0x11
.db 0x00, 0x0F, 0x38, 0x79, 0xCC, 0x30, 0xA2, 0x21, 0xAA, 0x3D, 0xD8, 0x7B, 0x24, 0xD3, 0x88, 0x42
.db 0x00, 0xC1, 0x87, 0x50, 0x1C, 0xEB, 0xBC, 0x90, 0x66, 0x3E, 0x18, 0xF4, 0xD1, 0xB0, 0x90, 0x72
.db 0x55, 0x39, 0x1E, 0x05, 0xEC, 0xD4, 0xBD, 0xA7, 0x92, 0x7D, 0x69, 0x56, 0x44, 0x32, 0x21, 0x10
.db 0x00, 0xF0, 0xE0, 0xD2, 0xC3, 0xB5, 0xA8, 0x9B, 0x8E, 0x81, 0x75, 0x69, 0x5E, 0x53, 0x48, 0x3D
.db 0x33, 0x29, 0x1F, 0x15, 0x0C, 0x03, 0xFA, 0xF1, 0xE8, 0xE0, 0xD8, 0xD0, 0xC8, 0xC0, 0xB9, 0xB1
.db 0xAA, 0xA3, 0x9C, 0x95, 0x8F, 0x88, 0x82, 0x7C, 0x76, 0x70, 0x6A, 0x64, 0x5E, 0x59, 0x53, 0x4E
.db 0x49, 0x43, 0x3E, 0x39, 0x34, 0x30, 0x2B, 0x26, 0x22, 0x1D, 0x19, 0x14, 0x10, 0x0C, 0x08, 0x04
.db 0x00, 0xFC, 0xF8, 0xF4, 0xF0, 0xEC, 0xE9, 0xE5, 0xE1, 0xDE, 0xDA, 0xD7, 0xD4, 0xD0, 0xCD, 0xCA
.db 0xC7, 0xC3, 0xC0, 0xBD, 0xBA, 0xB7, 0xB4, 0xB2, 0xAF, 0xAC, 0xA9, 0xA6, 0xA4, 0xA1, 0x9E, 0x9C
.db 0x99, 0x97, 0x94, 0x92, 0x8F, 0x8D, 0x8A, 0x88, 0x86, 0x83, 0x81, 0x7F, 0x7D, 0x7A, 0x78, 0x76
.db 0x74, 0x72, 0x70, 0x6E, 0x6C, 0x6A, 0x68, 0x66, 0x64, 0x62, 0x60, 0x5E, 0x5C, 0x5A, 0x58, 0x57
.db 0x55, 0x53, 0x51, 0x50, 0x4E, 0x4C, 0x4A, 0x49, 0x47, 0x46, 0x44, 0x42, 0x41, 0x3F, 0x3E, 0x3C
.db 0x3B, 0x39, 0x38, 0x36, 0x35, 0x33, 0x32, 0x30, 0x2F, 0x2E, 0x2C, 0x2B, 0x29, 0x28, 0x27, 0x25
.db 0x24, 0x23, 0x21, 0x20, 0x1F, 0x1E, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x16, 0x15, 0x14, 0x13, 0x12
.db 0x11, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01
;Recipricol table #2
;R2_TBL[x] = min( 2^16/(x+256), 255)
R2_TBL:
.db 0xFF, 0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA, 0xF9, 0xF8, 0xF7, 0xF6, 0xF5, 0xF4, 0xF3, 0xF2, 0xF1
.db 0xF0, 0xF0, 0xEF, 0xEE, 0xED, 0xEC, 0xEB, 0xEA, 0xEA, 0xE9, 0xE8, 0xE7, 0xE6, 0xE5, 0xE5, 0xE4
.db 0xE3, 0xE2, 0xE1, 0xE1, 0xE0, 0xDF, 0xDE, 0xDE, 0xDD, 0xDC, 0xDB, 0xDB, 0xDA, 0xD9, 0xD9, 0xD8
.db 0xD7, 0xD6, 0xD6, 0xD5, 0xD4, 0xD4, 0xD3, 0xD2, 0xD2, 0xD1, 0xD0, 0xD0, 0xCF, 0xCE, 0xCE, 0xCD
.db 0xCC, 0xCC, 0xCB, 0xCA, 0xCA, 0xC9, 0xC9, 0xC8, 0xC7, 0xC7, 0xC6, 0xC5, 0xC5, 0xC4, 0xC4, 0xC3
.db 0xC3, 0xC2, 0xC1, 0xC1, 0xC0, 0xC0, 0xBF, 0xBF, 0xBE, 0xBD, 0xBD, 0xBC, 0xBC, 0xBB, 0xBB, 0xBA
.db 0xBA, 0xB9, 0xB9, 0xB8, 0xB8, 0xB7, 0xB7, 0xB6, 0xB6, 0xB5, 0xB5, 0xB4, 0xB4, 0xB3, 0xB3, 0xB2
.db 0xB2, 0xB1, 0xB1, 0xB0, 0xB0, 0xAF, 0xAF, 0xAE, 0xAE, 0xAD, 0xAD, 0xAC, 0xAC, 0xAC, 0xAB, 0xAB
.db 0xAA, 0xAA, 0xA9, 0xA9, 0xA8, 0xA8, 0xA8, 0xA7, 0xA7, 0xA6, 0xA6, 0xA5, 0xA5, 0xA5, 0xA4, 0xA4
.db 0xA3, 0xA3, 0xA3, 0xA2, 0xA2, 0xA1, 0xA1, 0xA1, 0xA0, 0xA0, 0x9F, 0x9F, 0x9F, 0x9E, 0x9E, 0x9D
.db 0x9D, 0x9D, 0x9C, 0x9C, 0x9C, 0x9B, 0x9B, 0x9A, 0x9A, 0x9A, 0x99, 0x99, 0x99, 0x98, 0x98, 0x98
.db 0x97, 0x97, 0x97, 0x96, 0x96, 0x95, 0x95, 0x95, 0x94, 0x94, 0x94, 0x93, 0x93, 0x93, 0x92, 0x92
.db 0x92, 0x91, 0x91, 0x91, 0x90, 0x90, 0x90, 0x90, 0x8F, 0x8F, 0x8F, 0x8E, 0x8E, 0x8E, 0x8D, 0x8D
.db 0x8D, 0x8C, 0x8C, 0x8C, 0x8C, 0x8B, 0x8B, 0x8B, 0x8A, 0x8A, 0x8A, 0x89, 0x89, 0x89, 0x89, 0x88
.db 0x88, 0x88, 0x87, 0x87, 0x87, 0x87, 0x86, 0x86, 0x86, 0x86, 0x85, 0x85, 0x85, 0x84, 0x84, 0x84
.db 0x84, 0x83, 0x83, 0x83, 0x83, 0x82, 0x82, 0x82, 0x82, 0x81, 0x81, 0x81, 0x81, 0x80, 0x80, 0x80
.def NH = r16 .def NL = r17
.def DH = r18 .def DL = r19
.def N2H = r20 .def N2L = r21
.def N1H = r22 .def N1L = r23
.def PRODL = r0 .def PRODH = r1
.def PH = r2 .def PL = r3
.def D1H = r4 .def D1L = r5
.def RZERO = r6
.def Rx = r7
idivu_16x16_x:
clr RZERO ;1
;Check that DH is not zero
tst DH ;1
brne idivu_16x16_x_dhne ;2
;code for D < 256
idivu_16x8:
;lookup low byte of recipricol into P.
;where P = min(2^16 / D,2^16-1)
mov zl, DL ;1
ldi zh, high(R1L_TBL*2) ;1
lpm PL, Z ;3
ldi zh, high(R1H_TBL*2) ;1
lpm PH, Z ;3
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;calculate N2 = (P * N) >> 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; NH:NL
; X RH:RL
;------------------------------------------
; N2H | N2L | N1H | dropped
;----------+----------+----------+---------
; | | H(PL*NL) | L(PL*NL)
; | H(PL*NH) | L(PL*NH) |
; | H(PH*NL) | L(PH*NL) |
; H(PH*NH) | L(PH*NH) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mul NL , PL ;1 PL*NL
mov N1H, PRODH ;1 N1H <= H(PL*NL)
mul NH , PH ;1 PH*NH
mov N2H, PRODH ;1 N2H <= H(PH*NH)
mov N2L, PRODL ;1 N2L <= L(PH*NH)
mul NH , PL ;1 PL*NH
add N1H, PRODL ;1 N1H <= H(PL*NL) + L(PL*NH)
adc N2L, PRODH ;1 N2L <= L(PH*NH) + H(PL*NH)
adc N2H, RZERO ;1 propagate carry to N2H
mul NL , PH ;1 PH*NL
add N1H, PRODL ;1 N1H <= H(PL*NL) + L(PL*NH) + L(PH*NL)
adc N2L, PRODH ;1 N2L <= H(PH*NL) + L(PH*NH) + H(PL*NH)
adc N2H, RZERO ;1 propagate carry to N2H
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;calculate P = N2 * DL ,note DH=0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mul N2L, DL ;1
mov PL, PRODL ;1
mov PH, PRODH ;1
mul N2H, DL ;1
add PH, PRODL ;1
rjmp idivu_16x16_x_adj_n ;2
;code for D >= 256
idivu_16x16_x_dhne:
;Lookup Rx = min(256 / DH, 255)
mov zl, DH ;1
ldi zh, high(R1H_TBL*2) ;1
lpm Rx, Z ;3
;D1 = (D * Rx) >> 8
mul Rx , DH ;1
mov D1L, PRODL ;1
mov D1H, PRODH ;1
mul Rx , DL ;1
add D1L, PRODH ;1
adc D1H, RZERO ;1
;N1 = (D * Rx) >> 8
mul Rx , NH ;1
mov N1L, PRODL ;1
mov N1H, PRODH ;1
mul Rx , NL ;1
add N1L, PRODH ;1
adc N1H, RZERO ;1
;if D1H = 0 then use Rx = 256, otherwise use table
tst D1H ;1
brne idivu_16x16_x_dxhne ;2
mov N2L, N1H ;1
clr N2H ;1
rjmp idivu_16x16_x_checkn;2
idivu_16x16_x_dxhne:
;Lookup Rx = (2 ^ 16) \ (256 + D1H)
mov zl, D1L ;1
ldi zh, high(R2_TBL*2) ;1
lpm Rx, Z ;3
;N2 = (N1 * R2) >> 16
mul Rx , N1H ;1
mov PL , PRODL ;1
mov N2L , PRODH ;1
mul Rx , N1L ;1
add PL , PRODH ;1
adc N2L, RZERO ;1
clr N2H ;1
idivu_16x16_x_checkn:
;Check result (it may be off by +/- 1)
;P = N2 * D
;NOTE: N2 <=255 so NxH = 0, also P < 2^16 so we can discard upper byte of DH * NxL
mul DL , N2L ;1
mov PL, PRODL ;1
mov PH, PRODH ;1
mul DH , N2L ;1
add PH , PRODL ;1
brcc idivu_16x16_x_adj_n ;2
;if multiply overflowed then...
;decrement quotient
;calculate remainder as N - P + D
subi N2L, 0x01 ;1
sbci N2H, 0x00 ;1
mov N1L, NL ;1
mov N1H, NH ;1
sub N1L, PL ;1
sbc N1H, PH ;1
add N1L, DL ;1
adc N1H, DH ;1
rjmp idivu_16x16_x_end ;2
;Adjust result up or down by 1 if needed.
idivu_16x16_x_adj_n:
;Add -P to N, with result in P
mov N1L, NL ;1
mov N1H, NH ;1
sub N1L, PL ;1
sbc N1H, PH ;1
brsh idivu_16x16_x_pltn ;2
idivu_16x16_x_decn2:
;if P > N then decrement quotient, add to remainder
subi N2L, 0x01 ;1
sbci N2H, 0x00 ;1
add N1L, DL ;1
adc N1H, DH ;1
rjmp idivu_16x16_x_end ;2
idivu_16x16_x_pltn:
;test remainder to D
cp N1L, DL ;1
cpc N1H, DH ;1
;if remainder < D then goto end
brlo idivu_16x16_x_end ;2
;if remainder >= D then increment quotient, reduce remainder
subi N2L, 0xFF ;1
sbci N2H, 0xFF ;1
sub N1L, DL ;1
sbc N1H, DH ;1
idivu_16x16_x_end:
ret
.undef NH .undef NL
.undef DH .undef DL
.undef N2H .undef N2L
.undef N1H .undef N1L
.undef PRODL .undef PRODH
.undef PH .undef PL
.undef D1H .undef D1L
.undef RZERO
.undef Rx