#include "asm.h"

/*
** divu_t divu64_1(unsigned dh, unsigned dl, unsigned dv);
**
** 64-bit by 32-bit unsigned division: (dh:dl) / dv.
** Quotient ends up in d0 and remainder in d1 (the ovflow path fills
** exactly those two registers with "impossible" values, so divu_t is
** presumably returned in the d0/d1 pair -- confirm against asm.h/ABI).
**
** This routine is the standard binary divide, with several improvements.
**
** 1) The divisor is right-shifted one place.  This means that we don't need
**    an extension word on the left.  Shifting of the dividend is done after
**    the trial subtraction.
**
** 2) Addition of the negated divisor is used instead of subtraction.  This
**    allows us to use the carry bit directly to produce the next quotient
**    bit, and simplifies doing #3.
**
** 3) Instead of adding back the divisor for zero quotient bits, we wait
**    until the next iteration.  Then we do an add, instead of a subtract.
**    As a final step, the remainder may need to be adjusted.  Two
**    interlocking loops are used.  One for the subtracts, and one for the
**    adds.
**
** Register usage inside the loops:
**   d0 -- low 32 bits of dividend; quotient bits shift in from the right
**   d1 -- high 32 bits of dividend / running partial remainder
**   d2 -- floor(divisor/2)
**   d3 -- floor(-divisor/2)
**   d4 -- low bit of divisor, positioned in bit 31
**         ((d3:d4) together form the 64-bit value -(divisor << 31);
**          (d2:d4) together form +(divisor << 31))
**   d5 -- shift count, then loop counter 31..0
*/
	.text
	.global	divu64_1
divu64_1:
	lea	-16(sp), sp	// Make room and save the four
	moveml	d2-d5, (sp)	// scratch registers we clobber
	movel	28(sp), d2	// Get divisor (dv)
	movel	24(sp), d0	// Lower word of dividend (dl)
	movel	20(sp), d1	// Upper word of dividend (dh)
	cmpl	d2, d1		// Overflow? Divide by zero?
	bccb	ovflow		// Taken when dh >= dv: quotient won't fit in 32 bits
	movel	d2, d4		// d4 = divisor
	lsrl	#1, d2		// floor(divisor/2)
	movel	d2, d3
	subl	d4, d3		// floor(-divisor/2)
	movel	#31, d5		// Get bitcount-1
	lsll	d5, d4		// Use count to shift low bit into place
				// (d3:d4) is now -(divisor << 31), see #2

// Subtract loop: the previous quotient bit was 1, so the partial
// remainder is exact and we do a normal trial subtract.
sub_lp:	addl	d4, d0		// Trial subtraction, low bit
	addxl	d3, d1		// Trial subtract (add -(divisor << 31))
	bccb	clr_q		// Success? (no carry = borrow = bit is 0)
set_q:	addxl	d0, d0		// Yep, set quotient bit (X=1), shift dividend
	addxl	d1, d1		// Shift dividend, remainder
	subl	#1, d5		// Done?
	bplb	sub_lp		// Nope, go do another
	brab	done		// Last bit was 1: remainder already exact

// Add loop: the previous quotient bit was 0, so the partial remainder
// still has -divisor folded in; the deferred add-back and the next
// trial subtract combine into one add of +(divisor << 31), see #3.
add_lp:	addl	d4, d0
	addxl	d2, d1		// Add back, trial subtract
	bcsb	set_q		// Success? (carry = this subtract succeeded)
clr_q:	addxl	d0, d0		// Nope, clear quotient bit (X=0), shift dividend
	addxl	d1, d1		// Shift dividend, remainder
	subl	#1, d5		// Done?
	bplb	add_lp		// Nope, go do another
	addl	d4, d4		// Fell out with bit 0: d4+d4 moves divisor's
	addxl	d2, d2		// low bit into X to recover divisor in d2,
	addl	d2, d1		// and correct remainder (undo deferred add-back)

done:	moveml	(sp), d2-d5	// Restore saved registers
	lea	16(sp), sp
	rts

ovflow:	movel	#-1, d0		// Return all ones
	movel	d0, d1		// and impossible remainder
	brab	done