Login

britlion · 02-13-2010, 01:16 AM

I've got a version that does LONG as well as uInteger later in this thread.

I note that the compiler uses the Spectrum ROM square root routine. This routine is hideously slow. It actually calculates x^(0.5) instead, and takes ages about it. The Newton-Raphson method would be a lot faster, and pretty easy to put in.

If you are willing to sacrifice accuracy, an integer square root would be faster still. For a lot of situations, an integer root would be just fine - for example, if I had to calculate the nearest of two objects on screen, I'm going to have to use pythagoras' theorum to calculate distances. [ A^2 = B^2 + C^2 ] that needs square roots to make a distance. But probably the nearest whole pixel would be a perfectly good enough result!

So, here are two functions, and some code to demonstrate them. One is a perfect replacement for sqr.asm in the library, actually - it's full floating point compatible, 100% accurate, and about 3-6 times faster. It actually uses the FP-Calculator in the rom. It just doesn't use the SQR command. [Note: It comes back with something instead of an error in case of a negative square root request. Boriel - you might want to change that behavor. Not sure.] The integer version... well - look for yourself! I reckon it's about 40 times faster than the fast version.

Also: Should be able to do a something similar for a 32 bit LONG integer.

Copy and compile this program. I hope you like it:

Code:
FUNCTION FASTCALL SQRT (radicand as FLOAT) as FLOAT

ASM

; FLOATS arrive in A ED CB

;A is the exponent.

          AND   A               ; Test for zero argument 

          RET   Z               ; Return with zero.

          ;Strictly we should test the number for being negative and quit if it is.

          ;But let's assume we like imaginary numbers, hmm?

          ; If you'd rather break it change to a jump to an error below.

          ;BIT   7,(HL)          ; Test the bit.

          ;JR    NZ,REPORT       ; back to REPORT_A 

                                ; 'Invalid argument'

          RES 7,E               ; Now it's a positive number, no matter what.

          call __FPSTACK_PUSH   ; Okay, We put it on the calc stack. Stack contains ABS(x)

          ;   Halve the exponent to achieve a good guess.(accurate with .25 16 64 etc.)

                                ; Remember, A is the exponent.

          XOR   $80             ; toggle sign of exponent

          SRA   A               ; shift right, bit 7 unchanged.

          INC   A               ;

          JR    Z,ASIS          ; forward with say .25 -> .5

          JP    P,ASIS          ; leave increment if value > .5

          DEC   A               ; restore to shift only.

ASIS:     XOR   $80             ; restore sign.

          call __FPSTACK_PUSH   ; Okay, NOW we put the guess on the stack

          rst  28h    ; ROM CALC    ;;guess,x

          DEFB $C3              ;;st-mem-3              x,guess

          DEFB $02              ;;delete                x 

SLOOP:    DEFB  $31             ;;duplicate             x,x.

          DEFB  $E3             ;;get-mem-3             x,x,guess

          DEFB  $C4             ;;st-mem-4              x,x,guess

          DEFB  $05             ;;div                   x,x/guess.

          DEFB  $E3             ;;get-mem-3             x,x/guess,guess          

          DEFB  $0F             ;;addition              x,x/guess+guess          

          DEFB  $A2             ;;stk-half              x,x/guess+guess,.5

          DEFB  $04             ;;multiply              x,(x/guess+guess)*.5

          DEFB  $C3             ;;st-mem-3              x,newguess

          DEFB  $E4             ;;get-mem-4             x,newguess,oldguess

          DEFB  $03             ;;subtract              x,newguess-oldguess

          DEFB  $2A             ;;abs                   x,difference.

          DEFB  $37             ;;greater-0             x,(0/1).

          DEFB  $00             ;;jump-true             x.

          DEFB  SLOOP - $       ;;to sloop              x.

          DEFB  $02             ;;delete                .

          DEFB  $E3             ;;get-mem-3             retrieve final guess.

          DEFB  $38             ;;end-calc              sqr x.

          jp __FPSTACK_POP

END ASM

END FUNCTION

FUNCTION FASTCALL SQRT16(radicand as uInteger) as uByte

asm

    XOR A

    AND A

    ld  a,l

    ld  l,h

    ld    de,0040h    ; 40h appends "01" to D

    ld    h,d

    ld b,7

sqrt16loop:

    sbc    hl,de        ; IF speed is critical, and you don't mind spending the extra bytes, you could unroll this loop 7 times instead of DJNZ.

    jr    nc,$+3    

    add    hl,de        

    ccf            

    rl    d        

    rla            

    adc    hl,hl        

    rla            

    adc    hl,hl        

    DJNZ sqrt16loop

    sbc    hl,de        ; optimised last iteration

    ccf

    rl    d

    ld a,d

end asm

END FUNCTION

FUNCTION t AS ULONG

   RETURN INT((65536 * PEEK (23674) + 256 * PEEK(23673) + PEEK (23672)))

END FUNCTION

CLS

DIM a,b as float

DIM i as uInteger

DIM time as long

PRINT "ROM","FAST"

REM show it's as accurate

for i=1 to 15

    LET a=rnd * 32768

    PRINT SQR(a),SQRT(a)

next i

PRINT

PRINT "Over 500 Cycles:"

PRINT

REM ROM version 500 times.

LET time=t()

for i=1 to 500

    b=SQR(a)

next i

PRINT "Rom routine: ";t()-time;" Frames."

REM MY version 500 times.

LET time=t()

for i=1 to 500

    b=SQRT(a)

next i

PRINT "Fast routine: ";t()-time;" Frames."

PRINT

PRINT "PRESS A KEY"

PAUSE 1: PAUSE 0

CLS

PRINT "NUM   FAST      INTEGER"

REM show it's as accurate

for i=1 to 15

    LET a=INT(rnd * 32768)

    PRINT a;TAB 6;SQRT(a);TAB 16;SQRT16(a)

next i

PRINT

PRINT "Over 500 Cycles:"

PRINT

REM MY version 500 times.

LET time=t()

for i=1 to 500

    b=SQRT(a)

next i

PRINT "Fast routine: ";t()-time;" Frames."

REM MY Integer version 500 times.

LET time=t()

for i=1 to 500

    b=SQRT16(a)

next i

PRINT "Integer routine: ";t()-time;" Frames."

***boriel*** · 02-19-2010, 07:33 PM

britlion Wrote:I note that the compiler uses the Spectrum ROM square root routine. This routine is hideously slow. It actually calculates x^(0.5) instead, and takes ages about it. The Newton-Raphson method would be a lot faster, and pretty easy to put in.

If you are willing to sacrifice accuracy, an integer square root would be faster still. For a lot of situations, an integer root would be just fine - for example, if I had to calculate the nearest of two objects on screen, I'm going to have to use pythagoras' theorum to calculate distances. [ A^2 = B^2 + C^2 ] that needs square roots to make a distance. But probably the nearest whole pixel would be a perfectly good enough result!

If there's no "code license restrictions" (and I guess not), these functions can be added "as is" in the compilers library, and inserted with an #include directive.
What do you think? :roll:

britlion · 02-19-2010, 08:18 PM

I think that's a good idea.

We need a section of the wiki to cover library additions then.

Note that the float square root I wrote is a perfect replacement for the one in the asm library already - I suppose if people want to save the last few bytes they could use the original (it is shorter), but much slower.

Then again, if the optimization is working correctly, if SQR isn't used in the program, the library routine for it wouldn't be in anyway, yes?

***boriel*** · 02-19-2010, 09:59 PM

britlion Wrote:I think that's a good idea.

We need a section of the wiki to cover library additions then.

Note that the float square root I wrote is a perfect replacement for the one in the asm library already - I suppose if people want to save the last few bytes they could use the original (it is shorter), but much slower.

Then again, if the optimization is working correctly, if SQR isn't used in the program, the library routine for it wouldn't be in anyway, yes?

For SQR I'm just calling the ROM (to save memory). That's why I always use the ROM for FP calculations.

A wiki section for external functions is a very good idea. In fact, it was planned: Look here <a class="postlink" href="http://www.boriel.com/wiki/en/index.php/ZXBasic">http://www.boriel.com/wiki/en/index.php/ZXBasic</a> on the lower-right square ;-)

britlion · 02-19-2010, 11:17 PM

My floating point Square rooot routine (Function SQRT in the above code) is written so that it can replace sqrt.asm in the library.asm folder. It uses the ROM calculator too - but it doesn't call the SQR routine in it because it's really slow! It uses the calculator in a way that the ROM should have. It's at least triple the speed of the original, for a handful of bytes. Some modified ROMs use this method, I understand. I designed that function so it should plug straight in to your sqrt.asm file directly. It even calls the calculator stack routine you have.

By my count your version is 9 bytes, and my version is just 47 bytes. Probably not too big a sacrifice for the speed. Especially if it's not included if SQR isn't used.

My recommendation is to adopt my floating point one as the standard for the compiler, and have integer as a #include option. Though we probably need one that works with LONG data type as well.

***boriel*** · 02-19-2010, 11:29 PM

britlion Wrote:My floating point Square rooot routine (Function SQRT in the above code) is written so that it can replace sqrt.asm in the library.asm folder. It uses the ROM calculator too - but it doesn't call the SQR routine in it because it's really slow! It uses the calculator in a way that the ROM should have. It's at least triple the speed of the original, for a handful of bytes. Some modified ROMs use this method, I understand. I designed that function so it should plug straight in to your sqrt.asm file directly. It even calls the calculator stack routine you have.

By my count your version is 9 bytes, and my version is just 47 bytes. Probably not too big a sacrifice for the speed. Especially if it's not included if SQR isn't used.

My recommendation is to adopt my floating point one as the standard for the compiler, and have integer as a #include option. Though we probably need one that works with LONG data type as well.

I 47bytes can be really much! But I was thinking in an alternative solution. By using a --fast-float flag, this and other alternatives floating points routines will be uses in place of the standard ones. This could be easily acomplished by adding an #ifdef ... in the libasm/sqrt.asm file ;-)

britlion · 02-20-2010, 01:04 AM

Well, apparently, digging into the Tobos compiler looks like it might be fruitful for fast floating point routines - though they will eat space in RAM, yes.

The flag is a good idea.

How about something like -Opt-Size and -Opt-Speed to let the compiler choose the smallest or the fastest code available for functions?

Circle and draw could probably go the same way, for starters.

britlion · 02-20-2010, 01:06 AM

boriel Wrote:I 47bytes can be really much!

Just to be clear, it's only 47, not 147! Only 36 bytes more Wink

***boriel*** · 02-20-2010, 01:08 AM

britlion Wrote:Well, apparently, digging into the Tobos compiler looks like it might be fruitful for fast floating point routines - though they will eat space in RAM, yes.

The flag is a good idea.

How about something like -Opt-Size and -Opt-Speed to let the compiler choose the smallest or the fastest code available for functions?

Circle and draw could probably go the same way, for starters.

Hmmm. CIRCLE I think is already fast enough (it uses bresenham's integer sums) Tongue

Regarding to DRAW, it also uses Bresenham's integer algoritm, but, I have a faster DRAW in mind; however it will take even more memory. I use ROM's plot subroutine

britlion · 02-20-2010, 01:11 AM

Faster = Good for me :-)

Faster plot might be nice, too.

I was thinking that CIRCLE could be smaller if you used the (agonizingly slow) Rom routine - and the same with Draw. I doubt you could make them inherently faster; though a faster plot routine would help a lot, yes.

***boriel*** · 02-20-2010, 01:23 AM

britlion Wrote:Faster = Good for me :-)

Faster plot might be nice, too.

I was thinking that CIRCLE could be smaller if you used the (agonizingly slow) Rom routine - and the same with Draw. I doubt you could make them inherently faster; though a faster plot routine would help a lot, yes.

Unfortunately, I can't use neither ROM CIRCLE nor DRAW routine (I tried hard), because these routines mixed Sinclair BASIC parsing with execution. Calling them will trigger ROM parsing routines crashing the system. :?

britlion · 02-20-2010, 01:42 AM

Interesting. I didn't know that.

Hisoft Basic compiled code either uses the ROM or something identical to calculate the circles then. Huh.

I just found <a class="postlink" href="http://www.andreadrian.de/oldcpu/Z80_number_cruncher.html">http://www.andreadrian.de/oldcpu/Z80_nu ... ncher.html</a>, and then noticed you've borrowed code from that. I'm looking at the 16.16 routines with interest. Might be able to work those up as functions for a library, on the assumption that users looking for speed will be willing to sacrifice accuracy, and use FIXED type numbers.

britlion · 02-24-2010, 07:33 PM

Okay, integer square roots - this is the big version. It can deal with LONG integer input.

And right now, I never want to see a four-register 32 bit number again *lol* hooboy are they a pain. I found out the hard way, for example, that "inc HL" doesn't flag overflows. It all boils down to a lot of tests and jumps to allow for lower 16 bits flowing into upper and vice versa.

This version is 148 bytes of code. It incorporates the 16 bit version as an alternative super-fast routine. This version is about 50 times faster than the ROM routine for numbers > 65535 and over 100 times faster for numbers <= 65535. It only returns integer square roots (see http://en.wikipedia.org/wiki/Integer_square_root for details) - but for game purposes, these are generally fast enough. If you want floating point numbers, I've also published a routine about 6 times faster than the usual rom one; though, ironically, that uses the rom calculator; but uses a far better algorithm than the actual rom call.

If you want to reduce the size by about 1/3, you can cut the jump to 16 bit off the start, and cut out the 16 bit section at the end. It's still going to be 50 times faster than the ROM.

I called it sqrtLF - for Square Root Long Fast. But you can rename it if you want. Just sqrt isn't a bad name. You can't call it sqr though, as that is a reserved word.

Here you go:

Code:
FUNCTION FASTCALL sqrtLF (num as uLong) as uInteger

REM incoming is DEHL

REM outbound is HL

asm

LD A,D

OR E

JP Z, sqrtLF16bit ; we're inside a 16 bit number. We can use the faster version.

LD b,16 ; b times round

EXX ; Out to root and rem - we're doing most of this in alternate registers.

 LD DE,0

 LD HL,0 ; DEHL = remainder

 LD BC,0 ; BC = root

EXX   ;back to num and loop

sqrtLFasmloop:

EXX  ; out to root and rem

SLA  C ; root <<= 1

RL  B   ;

SLA L ; rem=rem<<1

RL  H  ;

RL  E    ;

RL  D     ;

NOP

SLA L ; rem=rem<<1

RL  H  ;

RL  E    ;

RL  D     ;

EXX  ; back to Num and loop 

LD a,d    ; A = inputnum>>30

AND 192

RLCA

RLCA

SLA  L ; num <<= 1

RL  H

RL  E

RL  D

SLA  L ; num <<= 1

RL  H

RL  E

RL  D

EXX  ; out to root and rem

  ADD A,L     ; a=a+L              ; REM=REM+num>>30

  LD L,A      ; a-> L               ;

  JR NC, sqrtLFasmloophop1           ;

  INC H

  JR NC, sqrtLFasmloophop1

  INC DE           ;                               ;

sqrtLFasmloophop1:

  INC BC                       ; root=root+1

sqrtLFasmloophop2:

  ; DEHL = Remainder

  ; BC = root

  ; if rem >= root then

  LD A,D

  OR E

  JR NZ, sqrtLFasmthen ; if rem > 65535 then rem is definitely > root and we go to true

  LD A, H

  CP B

  JR C, sqrtLFasmelse ; H<B - that is rem<root so rem>=root is false and we go to else 

  JR NZ, sqrtLFasmthen ; H isn't zero though, so we could do a carry from it, so we're good to say HL is larger.

  ; if h is out, then it's down to L and C 

  LD A,L

  CP C

  JR C, sqrtLFasmelse ; L<C - that is rem<root so rem>=root is false and we go to else

  ; must be true - go to true.

  sqrtLFasmthen:

  ;remainder=remainder-root

  AND A ; clear carry flag

  SBC HL,BC ; take root away from the lower half of rem.

  JP NC, sqrtLFasmhop3 ; we didn't take away too much, so we're okay to loop round.

  ; if we're here, we did take away too much. We need to borrow from DE

  DEC DE ; borrow off DE

  sqrtLFasmhop3:

  INC BC ;root=root+1

  JP sqrtLFasmloopend

  ;else 

  sqrtLFasmelse:

  DEC BC ;root=root-1

  ;end if

sqrtLFasmloopend:

EXX  ; back to num

  DJNZ sqrtLFasmloop

EXX ; out to root and rem

  PUSH BC

EXX ; back to normal

POP HL

SRA  H

RES 7,H

RR  L       ; Hl=HL/2 - root/2 is the answer.

jr sqrtLFexitFunction

sqrtLF16bit:

ld  a,l

ld  l,h

ld   de,0040h   ; 40h appends "01" to D

ld   h,d

ld b,7

sqrtLFsqrt16loop:

  sbc   hl,de      ; IF speed is critical, and you don't mind spending the extra bytes, you could unroll this loop 7 times instead of DJNZ.

  jr    nc,$+3   

  add   hl,de      

  ccf         

  rl    d      

  rla         

  adc   hl,hl      

  rla         

  adc   hl,hl      

DJNZ sqrtLFsqrt16loop

sbc   hl,de      ; optimised last iteration

ccf

rl   d

ld h,0

ld l,d 

ld de,0

sqrtLFexitFunction:   

end asm

END FUNCTION

Login
Username:
Password:	Lost Password?
	Remember me