#include "asm.h" /* ** This is a non-restoring version of the bit-by-bit square root ** routine. Instead of adding back 2*a*2**n+2**(2*n) after finding ** a zero bit, the negative remainder is kept into the next iteration. ** At that, instead of subtracting 2*a*2**(n-1)+2**(2*n-2) from the ** remainder, we add 2*a*2**n+2**(2*n)-(2*a*2**(n-1)+2**(2*n-2)), ** which simplifies to 2*a*2**(n-1)+3*2**(2*n-2). */ .text // unsigned long sqroot(unsigned long long v); .global sqrt64 sqrt64: movel d2,-(sp) movel d3,-(sp) movel 12(sp),d1 // Upper longword movel 16(sp),d2 movel #0x40000000,d0 // Initial trial bit movel #0x30000000,d3 // Counter, adjust value // First iteration: subl d0,d1 // Trial subtraction jcs msb_is_0 bset #28,d0 // Fixup for next trial lsrl #1,d3 jra trial_sub msb_is_0: movel d3,d0 // Fixup for restore/subtract lsrl #1,d3 jra rstor_sub // Here are the central 29 iterations: // xx11xx: adj_11: addl d3,d0 // d0 = 2a + b (xx1010+011 -> xx1101) asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder lsrl #1,d3 // Advance to next lower bit position jcs ffb_1 trial_sub: subl d0,d1 // Trial subtraction jcc adj_11 // Success, loop on // xx10xx: orl d3,d0 // d0 = 2a + 3b (xx1010|011 -> xx1011 asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder lsrl #1,d3 // Advance to next lower bit position jcs ffb_0 rstor_sub: addl d0,d1 // Restore & trial subtraction jcc adj_00 // Failed, add back // xx01xx: eorl d3,d0 // d0 = 2a + b (xx0110^011 -> xx0101) asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder lsrl #1,d3 // Advance to next lower bit position jcc trial_sub // Final few bits require a bit more work: ffb_1: movel #0x80000000,d3 subl d0,d1 // Trial subtraction jcs ntntl_0 addl #1,d0 // Set bit 2 of result jra ntl_s ntntl_1: subl #1,d0 // Leave bit 2 of result set ntl_s: asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder subl d3,d2 subxl d0,d1 jcs ntl_0 addl #1,d0 // Set bit 1 of result ntl_1: lsrl #1,d3 asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder subl d3,d2 subxl d0,d1 scc d3 // Complement X lsrl #1,d3 jra lsb // xx00xx: adj_00: subl d3,d0 // d0 = 2a + 3b (xx0110-011 -> xx0011 asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder lsrl #1,d3 // Advance to next lower bit position jcc rstor_sub // Final few bits require a bit more work: ffb_0: movel #0x80000000,d3 addl d0,d1 // Restore & trial subtraction jcs ntntl_1 subl #2,d0 // Clear bit 2 of result ntntl_0: asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder addl d3,d2 // Next to last add back/subtract addxl d0,d1 jcs ntl_1 subl #1,d0 // Clear bit 1 of result ntl_0: asrl #1,d3 asll #1,d2 // Pick up another bit addxl d1,d1 // of the remainder addl d3,d2 addxl d0,d1 lsb: addxl d0,d0 // Shift lsb of result in from X exit: movel (sp)+,d3 movel (sp)+,d2 rts