Here a Fast SQRT (Square Root) Calculator using the Babylonian method:

For more informations visit this link:
http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Babylonian_method
Follows the code below:

For more informations visit this link:
http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Babylonian_method
Follows the code below:
- Code:
; ---------------------------------------------------------------------------
; Subroutine to calculate the Square Root with the Babylonian method
; d0 = input longword = any value until $FFFFFFFF
; d0 = output word = a root until $FFFF
; ---------------------------------------------------------------------------
; ||||||||||||||| S U B R O U T I N E |||||||||||||||||||||||||||||||||||||||
SquareRoot: ; arg (32 bit) & result (16 bit) in D0
movem.l d1-d3,-(sp) ; save the variables that will be changed in this calculation
move.l d0,d3 ; save S in d3
swap d0 ; swap to tst if is <= 65535 (0000FFFF -> FFFF0000)
tst.w d0 ; is <= 65535?
beq.s .16bit ; if yes, this number is 16bit or 8bit
cmpi.w #255,d0 ; is <= 255 (00FF0000 -> 000000FF)?
bls.s .24bit ; if yes, this number is 24bit
; --------------------------------------------------------
.32bit:
cmp.w #$FFFD,d0 ; the numer is >= FFFD0000?
bcc.s max_result ; if yes, result = 0000FFFF
; in the first Iteration, we will use 2^n as x0
; this calculation is to get the n,
; when we pass the loop (FFFF+y = a number < FFFF; ie, y+y = number < y)
; this was commented as "greater than 65535"
moveq #16,d1 ; in 32bit the n = 16 in start (2^16=65536, max result of a 32bit square root)
add.w d0,d0 ; *2 (shift 1 bit to left, more faster than the lsl)
bcs.s START ; if the result is greater than 65535 (because the swap FFFF0000 -> 0000FFFF),
; start the square root with n = 16
subq.w #1,d1 ; if not, sub 1 of n
Norm32:
add.w d0,d0 ; *2 (shift 1 bit to left)
bcs.s START ; if the result is low than d0 (greater than 65535), start the square root
add.w d0,d0 ; *2 (shift 1 bit to left)
dbcs d1,Norm32 ; if the result is greater than d0 (low than 65535), loop and sub 1 of n
bra.s START ; if is greater than 65535, start the square root
max_result:
moveq #0,d0 ; result = 00000000
subq.w #1,d0 ; result = 0000FFFF
movem.l (sp)+,d1-d3 ; load the variables that were modified
rts
; --------------------------------------------------------
.24bit:
; in the first Iteration, we will use 2^n as x0
; this calculation is to get the n,
; when we pass the loop (FF+y = a number < FF; ie, y+y = number < y)
; this was commented as "greater than 255"
moveq #12,d1 ; in 24bit the n = 12 in start (2^12=4096, max result of a 24bit square root)
add.b d0,d0 ; *2 (shift 1 bit to left, more faster than the lsl)
bcs.s START ; if the result is greater than 255 (because the swap 00FF0000 -> 000000FF),
; start the square root with n = 12
subq.w #1,d1 ; if not, sub 1 of n
Norm24:
add.b d0,d0 ; *2 (shift 1 bit to left)
bcs.s START ; if the result is low than d0 (greater than 255), start the square root
add.b d0,d0 ; *2 (shift 1 bit to left)
dbcs d1,Norm24 ; if the result is greater than d0 (low than 255), loop and sub 1 of n
; if is greater than 255, start the square root
; --------------------------------------------------------
START: ; first Iteration // x1 = (S/x0 + x0) / 2
move.l d3,d2 ; copy S to d2
lsr.l d1,d2 ; S/2^n (n=d1)
moveq #0,d0 ; clear d0
bset d1,d0 ; set d0=2^n
addx.l d2,d0 ; S/x0 + x0
lsr.l d0 ; /2
; now d0=x1
subq.l #1,d3 ; S=S-1, for better results
; second Iteration // x2 = (S/x1 + x1) / 2
move.l d3,d1 ; copy S to d1
divu d0,d1 ; S/x1
addq.w #1,d0 ; LSB rounded, no overflow possible here!
add.w d1,d0 ; S/x1 + x1
roxr.w d0 ; /2 (asr, div and lsr can cause a ZERO DIVIDE. ex: When S = 2^30)
; now d0=x2
; third Iteration // x3 = (S/x2 + x2) / 2
; large values need more iterations
divu d0,d3 ; S/x2
addq.w #1,d0 ; LSB rounded
add.w d3,d0 ; S/x2 + x2
roxr.w d0 ; /2 (asr, div and lsr can cause a ZERO DIVIDE)
START_rts:
movem.l (sp)+,d1-d3 ; load the variables that were modified
rts
; --------------------------------------------------------
.16bit:
swap d0 ; return the value to original state (____0000 -> 0000____)
cmpi.w #255,d0 ; is <= 255 (000000FF)?
bls.s .8bit ; if yes, this number is 8bit
; in the first Iteration, we will use 2^n as x0
; this calculation is to get the n,
; when we pass the loop (FFFF+y = a number < FFFF; ie, y+y = number < y)
; this was commented as "greater than 65535"
moveq #8,d1 ; in 16bit the n = 8 in start (2^8=256, max result of a 16bit square root)
add.w d0,d0 ; *2 (shift 1 bit to left, more faster than the lsl)
bcs.s STARTS ; if the result is greater than 65535, start the square root with n = 8
subq.w #1,d1 ; if not, sub 1 of n
Norm16:
add.w d0,d0 ; *2 (shift 1 bit to left)
bcs.s STARTS ; if the result is low than d0 (greater than 65535), start the square root
add.w d0,d0 ; *2 (shift 1 bit to left)
dbcs d1,Norm16 ; if the result is greater than d0 (low than 65535), loop and sub 1 of n
bra.s STARTS ; if is greater than 65535, start the square root
; --------------------------------------------------------
.8bit:
cmp.w #1,d0 ; is 0 or 1?
bls.s START_rts ; if yes, return with sqrt of 0 or 1
; in the first Iteration, we will use 2^n as x0
; this calculation is to get the n,
; when we pass the loop (FF+y = a number < FF; ie, y+y = number < y)
; this was commented as "greater than 255"
moveq #4,d1 ; in 8bit the n = 4 in start (2^4=16, max result of a 8bit square root)
add.b d0,d0 ; *2 (shift 1 bit to left, more faster than the lsl)
bcs.s STARTS ; if the result is greater than 255, start the square root with n = 4
subq.w #1,d1 ; if not, sub 1 of n
Norm8:
add.b d0,d0 ; *2 (shift 1 bit to left)
bcs.s STARTS ; if the result is low than d0 (greater than 255), start the square root
add.b d0,d0 ; *2 (shift 1 bit to left)
dbcs d1,Norm8 ; if the result is greater than d0 (low than 255), loop and sub 1 of n
; if is greater than 255, start the square root
; --------------------------------------------------------
STARTS: ; first Iteration // x1 = (S/x0 + x0) / 2
move.l d3,d2 ; copy S to d2
lsr.w d1,d2 ; S/2^n (n=d1)
moveq #0,d0 ; clear d0
bset d1,d0 ; set d0=2^n
addx.w d2,d0 ; S/x0 + x0
lsr.w d0 ; /2
; now d0=x1
subq.w #1,d3 ; S=S-1, for better results
; second Iteration // x2 = (S/x1 + x1) / 2
divu d0,d3 ; S/x1
addq.w #1,d0 ; LSB rounded
add.w d3,d0 ; S/x1 + x1
roxr.w d0 ; /2 (asr, div and lsr can cause a ZERO DIVIDE)
movem.l (sp)+,d1-d3 ; load the variables that were modified
rts
; End of function SquareRoot
; ===========================================================================