/* --- Copyright University of Sussex 1994. All rights reserved. ----------
 * File:            S.axpvms/src/aarith.s
 * Purpose:
 * Author:          John Gibson, Aug 31 1994
 */

	.title	"aarith.o"	;;; must be the object file name

;;; ------------------- ARITHMETIC ROUTINES ------------------------------

#_<

#_INCLUDE 'asm.ph'

constant
	procedure $-Sys$-Array$-Sub_error
	;

lconstant macro (
	_PD_ARRAY_TABLE		= @@PD_ARRAY_TABLE,
	_PD_ARRAY_VECTOR	= @@PD_ARRAY_VECTOR,
	_PD_ARRAY_SUBSCR_PDR	= @@PD_ARRAY_SUBSCR_PDR,
	);


>_#


ASM_CODE_PSECT


;;; --- LOGICAL BIT ROUTINES ------------------------------------------------

	;;; _biset(______int1, ______int2) -> ______int3  (logical or)

	.align quad
DEF_C_LAB 4 (_biset)
	ldl	rt1, (rusp)
	ldl	rt0, 4(rusp)
	lda	rusp, 4(rusp)
	or	rt0, rt1, rt0
	stl	rt0, (rusp)
	ret	(rret)


	;;; _biclear(______int1, ______int2) -> ______int3  (logical and not)

	.align quad
DEF_C_LAB 4 (_biclear)
	ldl	rt1, (rusp)
	ldl	rt0, 4(rusp)
	lda	rusp, 4(rusp)
	bic	rt0, rt1, rt0
	stl	rt0, (rusp)
	ret	(rret)


	;;; _bimask(______int1, ______int2) -> ______int3  (logical and)

	.align quad
DEF_C_LAB 4 (_bimask)
	ldl	rt1, (rusp)
	ldl	rt0, 4(rusp)
	lda	rusp, 4(rusp)
	and	rt0, rt1, rt0
	stl	rt0, (rusp)
	ret	(rret)


	;;; _bixor(______int1, ______int2) -> ______int3  (logical exclusive or)

	.align quad
DEF_C_LAB 4 (_bixor)
	ldl	rt1, (rusp)
	ldl	rt0, 4(rusp)
	lda	rusp, 4(rusp)
	xor	rt0, rt1, rt0
	stl	rt0, (rusp)
	ret	(rret)


;;; --- MULTIPLY -----------------------------------------------------------

	;;; _pmult(____int1, ____int2) -> _______product
	;;; pop integer multiply
	.align quad
DEF_C_LAB 4 (_pmult)
	ldl	rt1, (rusp)		;;; ____int2
	ldl	rt0, 4(rusp)		;;; ____int1
	sra	rt1, #2, rt1		;;; make ____int2 sysint
	bic	rt0, #3, rt0		;;; clear tag bits on other
	mull	rt0, rt1, rt0		;;; do the multiply
	lda	rusp, 4(rusp)
	bis	rt0, #3, rt0		;;; set tag bits on result
	stl	rt0, (rusp)		;;; return it
	ret	(rret)

	;;; _pmult_testovf(____int1, ____int2) -> (_______product, ____bool)
	;;; ____bool true if no overflow, false if overflow
	.align quad
DEF_C_LAB (_pmult_testovf)
	ldl	rt1, (rusp)		;;; ____int2
	ldl	rt0, 4(rusp)		;;; ____int1
	sra	rt1, #2, rt1		;;; make ____int2 sysint
	bic	rt0, #3, rt0		;;; clear tag bits on other
	mulq	rt0, rt1, rt0		;;; do 64-bit multiply
	lda	rt2, _TRUEOFFS(rfalse)	;;; get true for ____bool result
	sra	rt0, #31, rt1		;;; sign of 32-bit result sb 0 or -1
	bis	rt0, #3, rt0		;;; set tag bits on result
	addq	rt1, #1, rt1		;;; sign+1 should give 1 or 0 ...
	stl	rt0, 4(rusp)		;;; store _______product in any case
	bic	rt1, #1, rt1		;;; ... now 0
	cmovne	rt1, rfalse, rt2	;;; replace true with false if nonzero
	stl	rt2, (rusp)		;;; return it
	ret	(rret)


;;; --- DIVIDE -------------------------------------------------------------

	;;; _div(__________dividend, _________divisor) -> (___________remainder, __________quotient)
	;;; integer divide

	.align quad
DEF_C_LAB 4 (_div)
	ldl	rt0, 4(rusp)		;;; __________dividend
	ldl	rt1, (rusp)		;;; _________divisor
	bsr	rchain, quot_rem
	stl	rt0, 4(rusp)		;;; ___________remainder
	stl	rt1, (rusp)		;;; __________quotient
	ret	(rret)


	;;; _divq(__________dividend, _________divisor) -> __________quotient
	;;; integer divide, quotient only

	.align quad
DEF_C_LAB 2 (_divq)
	ldl	rt0, 4(rusp)		;;; __________dividend
	ldl	rt1, (rusp)		;;; _________divisor
	lda	rusp, 4(rusp)
	bsr	rchain, quot_rem
	stl	rt1, (rusp)		;;; __________quotient
	ret	(rret)


	;;; _pdiv(________dividend, _______divisor) -> (_________remainder, ________quotient)
	;;; pop integer divide

	.align quad
DEF_C_LAB 4 (_pdiv)
	ldl	rt0, 4(rusp)		;;; ________dividend
	ldl	rt1, (rusp)		;;; _______divisor
	sra	rt0, #2, rt0		;;; ________dividend -> sysint
	sra	rt1, #2, rt1		;;; _______divisor -> sysint
	bsr	rchain, quot_rem
	s4addl	rt0, #3, rt0		;;; ___________remainder -> popint
	s4addl	rt1, #3, rt1		;;; __________quotient -> popint
	stl	rt0, 4(rusp)		;;; _________remainder
	stl	rt1, (rusp)		;;; ________quotient
	ret	(rret)


;;; ------------------------------------------------------------------------

	;;; _pint_testovf(_____int) -> (____pint, true)  if okay
	;;;			-> false         if overflow
	;;; convert int to popint, with overflow test

	.align quad
DEF_C_LAB (_pint_testovf)
	ldl	rt0, (rusp)		;;; _____int
	lda	rt3, _TRUEOFFS(rfalse)	;;; anticipate true needed
	sll	rt0, #32+2, rt2		;;; do popint shift in top 32 bits
	s4addl	rt0, #3, rt1		;;; get ____pint result in any case
	sra	rt2, #32+2, rt2		;;; reverse test shift
	cmpeq	rt2, rt0, rt2		;;; overflow if it changed
	blbc	rt2, 1$			;;; br if so
	lda	rusp, -4(rusp)		;;; push stack
	stl	rt1, 4(rusp)		;;; return ____pint
	stl	rt3, (rusp)		;;; and true
	ret	(rret)
1$:	stl	rfalse, (rusp)		;;; false if overflow
	ret	(rret)


;;; ------------------------------------------------------------------------

	;;; _pshift_testovf(_____pint1, _______nbits) -> (_____pint2, true)  if okay
	;;;				   -> false          if overflow
	;;; shift popint left, with overflow test

	.align quad
DEF_C_LAB (_pshift_testovf)
	ldl	rt0, 4(rusp)		;;; _____pint1
	ldl	rt1, (rusp)		;;; _______nbits
	lda	rt5, _TRUEOFFS(rfalse)	;;; anticipate true needed
	bic	rt0, #3, rt0		;;; clear tag bits on _____pint1
	addl	rt1, #32, rt2		;;; do a test shift in top 32 bits
	bic	rt2, #63, rt3		;;; shift now > 63 ?
	bne	rt3, 2$			;;; br if so
	sll	rt0, rt2, rt3		;;; do the test shift ...
	sll	rt0, rt1, rt4		;;; and the proper shift in any case
	sra	rt3, rt2, rt2		;;; ... and test shift back again
	bis	rt4, #3, rt4		;;; set tag bits on proper result
	cmpeq	rt2, rt0, rt1		;;; overflow if it changed
	blbc	rt1, 3$			;;; br if so
	stl	rt4, 4(rusp)		;;; return _____pint2
1$:	stl	rt5, (rusp)		;;; and true
	ret	(rret)
	;;; shift too big -- overflow unless _____pint1 zero
2$:	beq	rt0, 1$			;;; ok, leave _____pint1 and return true
	;;; overflow -- return false
3$:	stl	rfalse, 4(rusp)		;;; false if overflow
	lda	rusp, 4(rusp)
	ret	(rret)


;;; --- BIGINTEGER ARITHMETIC --------------------------------------------

	;;; _emul(______________multiplicand, ____________multiplier) -> (____lo, ____hi)
	;;; multiply two slices to get double slice result

	.align quad
DEF_C_LAB (_emul)
DEF_C_LAB (_posword_emul)
	ldl	rt0, 4(rusp)		;;; ______________multiplicand
	ldl	rt1, (rusp)		;;; ____________multiplier
	mulq	rt0, rt1, rt0		;;; do the multiply
	ldah	rt2, -^X8000(rzero)	;;; ones in bits 31-63
	sra	rt0, #31, rt1		;;; ____hi result
	bic	rt0, rt2, rt0		;;; clear hi part = ____lo result
	stl	rt1, (rusp)		;;; return ____hi
	stl	rt0, 4(rusp)		;;; return ____lo
	ret	(rret)


	;;; _ediv(____hi, ____lo, _________divisor) -> (___________remainder, __________quotient)
	;;; divide double slice dividend by single

	.align quad
DEF_C_LAB (_ediv)
	ldl	rt0, 8(rusp)		;;; ____hi
	ldl	rt2, 4(rusp)		;;; ____lo
	ldl	rt1, (rusp)		;;; _________divisor
	lda	rusp, 4(rusp)
	bsr	rchain, ediv		;;; quot in rt1, rem in rt0
	stl	rt0, 4(rusp)		;;; return ___________remainder
	stl	rt1, (rusp)		;;; return __________quotient
	ret	(rret)


	;;; _bgi_mult(_____val, _______saddr, ______slim, _______daddr) -> (_______carry, __________nextdest)
	;;; multiply a biginteger by a signed value into a destination bigint

	.align quad
DEF_C_LAB (_bgi_mult)
	ldl	rt0, 12(rusp)		;;; multiplier _____val
	ldl	rt1, 8(rusp)		;;; source addr _______saddr
	ldl	rt2, 4(rusp)		;;; source lim ______slim
	ldl	rt3, (rusp)		;;; destination addr _______daddr
	lda	rusp, 8(rusp)
	clr	rt4			;;; zero carry slice
	ldah	rt6, -^X8000(rzero)	;;; ones in bits 31-63

1$:	ldl	rt5, (rt1)		;;; next source slice in rt5
	lda	rt1, 4(rt1)		;;; step source
	mulq	rt5, rt0, rt5		;;; multiply source by _____val
	lda	rt3, 4(rt3)		;;; step destination
	addq	rt5, rt4, rt5		;;; add last carry
	sra	rt5, #31, rt4		;;; hi part is next carry
	bic	rt5, rt6, rt5		;;; clear hi part to get lo part
	stl	rt5, -4(rt3)		;;; store lo at last destination
	cmpult	rt1, rt2, rt5		;;; compare source addr with lim
	blbs	rt5, 1$			;;; next source slice if more

	stl	rt4, 4(rusp)		;;; return next carry slice
	stl	rt3, (rusp)		;;; and next destination
	ret	(rret)


	;;; _bgi_mult_add(_____val, _______saddr, ______slim, ________sdaddr) -> (_______carry, __________nextdest)
	;;; multiply a biginteger by a signed value
	;;; and add into a destination bigint

	.align quad
DEF_C_LAB (_bgi_mult_add)
	ldl	rt0, 12(rusp)		;;; multiplier _____val
	ldl	rt1, 8(rusp)		;;; source addr _______saddr
	ldl	rt2, 4(rusp)		;;; source lim ______slim
	ldl	rt3, (rusp)		;;; destination addr _______daddr
	lda	rusp, 8(rusp)
	clr	rt4			;;; zero carry slice
	ldah	rt6, -^X8000(rzero)	;;; ones in bits 31-63

1$:	ldl	rt5, (rt1)		;;; next source slice into rt5
	lda	rt1, 4(rt1)		;;; step source
	mulq	rt5, rt0, rt5		;;; multiply source by _____val
	ldl	rchain, (rt3)		;;; next destination slice
	lda	rt3, 4(rt3)		;;; step destination
	addq	rt4, rchain, rt4	;;; add dest slice to last carry
	addq	rt5, rt4, rt5		;;; add last carry to product
	sra	rt5, #31, rt4		;;; hi part is next carry
	bic	rt5, rt6, rt5		;;; clear hi part to get lo part
	stl	rt5, -4(rt3)		;;; store lo at last destination
	cmpult	rt1, rt2, rt5		;;; compare source addr with lim
	blbs	rt5, 1$			;;; next source slice if more

	stl	rt4, 4(rusp)		;;; return next carry slice
	stl	rt3, (rusp)		;;; and next destination
	ret	(rret)


	;;; _bgi_div(_________divisor, _______saddr, ______slim, ______dlim) -> ___________remainder
	;;; divide a biginteger by a signed value into a destination bigint

	.align quad
DEF_C_LAB (_bgi_div)
	stl	rnpl0, -4(rusp)		;;; save local regs
	stl	rnpl1, -8(rusp)
	stl	rnpl2, -12(rusp)
	stl	rnpl3, -16(rusp)

	ldl	rnpl0, 12(rusp)		;;; _________divisor
	ldl	rnpl1, 8(rusp)		;;; source addr _______saddr
	ldl	rnpl2, 4(rusp)		;;; source lim ______slim
	ldl	rnpl3, (rusp)		;;; destination lim ______dlim

	;;; do first slice with quot_rem
	ldl	rt0, -4(rnpl2)		;;; most sig slice (signed) into rt0
	mov	rnpl0, rt1		;;; divisor into rt1
	bsr	rchain, quot_rem	;;; quot in rt1, rem in rt0
	br	2$

	;;; loop for rest done with ediv
1$:	ldl	rt2, -4(rnpl2)		;;; next (+ve) slice into rt2 is lo
	mov	rnpl0, rt1		;;; divisor into rt1
	bsr	rchain, ediv		;;; quot in rt1, rem in rt0
2$:	stl	rt1, -4(rnpl3)		;;; store quot at next dest (can be -ve)
	lda	rnpl2, -4(rnpl2)	;;; step back source
	lda	rnpl3, -4(rnpl3)	;;; step back destination
	cmpule	rnpl2, rnpl1, rt3	;;; reached source start?
	blbc	rt3, 1$			;;; loop if not

	ldl	rnpl0, -4(rusp)		;;; restore local regs
	ldl	rnpl1, -8(rusp)
	ldl	rnpl2, -12(rusp)
	ldl	rnpl3, -16(rusp)
	lda	rusp, 12(rusp)		;;; remove args but 1
	stl	rt0, (rusp)		;;; return remainder
	ret	(rret)


;;; --- COMPUTE ARRAY SUBSCRIPTS -----------------------------------------

	;;; _array_sub()
	;;; compute an array total subscript -- called inside an array
	;;; procedure

	.align quad
DEF_C_LAB (_array_sub)
	lda	rt5, _PD_ARRAY_TABLE+4(rpb)	;;; start of parameters + 4
	ldl	rt4, -4(rt5)	;;; init total subscript to base subscript
	ldl	rt3, (rt5)	;;; length in first dimension
	br	3$

1$:	ldl	rt2, (rusp)	;;; next dimension subscript from stack
	ldl	rt1, 4(rt5)	;;; dimension lower bound
	and	rt2, #2, rt0	;;; subscript is popint?
	subl	rt2, rt1, rt2	;;; subtract lower bound from subscript
	beq	rt0, 4$		;;; error if subscript not popint
	cmpult	rt2, rt3, rt0	;;; compare with length
	ldl	rt1, 8(rt5)	;;; dimension scaling factor
	beq	rt0, 4$		;;; error if subscript >= length unsigned
	lda	rusp, 4(rusp)	;;; inc stack
	beq	rt1, 2$		;;; no multiply needed if scale factor 0
	mull	rt2, rt1, rt2	;;; multiply by scale factor
2$:	lda	rt5, 12(rt5)	;;; step to next dimension params
	ldl	rt3, (rt5)	;;; length in next dimension
	addl	rt4, rt2, rt4	;;; add scaled subscript to total
3$:	bne	rt3, 1$		;;; loop if nonzero

	;;; finished -- stack total subscript and arrayvector, and then
	;;; chain subscripting procedure
	ldl	rt0, _PD_ARRAY_VECTOR(rpb)
	stl	rt4, -4(rusp)	;;; subscript
	stl	rt0, -8(rusp)	;;; arrayvector
	ldl	rpb, _PD_ARRAY_SUBSCR_PDR(rpb)
	ldl	rt0, (rpb)
	lda	rusp, -8(rusp)	;;; dec stack
	jmp	(rt0)		;;; chain subscr procedure

	;;; subscript error (bad subscript still tos) -- call error procedure
4$:	ldl	rpb, _SVB_OFFS(Sys$-Array$-Sub_error)(rsvb)
	ldl	rt0, (rpb)
	jmp	(rt0)		;;; chain Sub_error


;;; --- 32-BIT BY 32-BIT SIGNED DIVISION ROUTINE --------------------------
;;; Takes dividend in rt0, divisor in rt1.
;;; Produces quotient in rt1, remainder in rt0.
;;; Uses rt0 - rt5. Return in rchain.

	.align quad
quot_rem:
	clr	rt2			;;; clear sign indicators
	bgt	rt1, 1$			;;; br if divisor +ve
	negl	rt1, rt1		;;; negate it
	bis	rt2, #1, rt2		;;; set negate quot at end
	bgt	rt1, 1$			;;; br if divisor now +ve
	beq	rt1, div_zero		;;; br if dividing by zero
	;;; largest -ve divisor
	cmpeq	rt1, rt0, rt1		;;; quot=1 if dividend same, 0 if not
	cmovlbs	rt1, #0, rt0		;;; zero rem if quot=1
	ret	(rchain)

1$:	bge	rt0, 2$			;;; br if dividend >= 0
	negq	rt0, rt0		;;; negq allows greatest -ve dividend
	xor	rt2, #1, rt2		;;; invert negate quot at end
	subl	rt2, #2, rt2		;;; set negate rem at end

	;;; establish amount to shift quotient up to align top with dividend
2$:	clr	rt3			;;; zero shift count

#_<
lvars n;
for n from 4 by -1 to 0 do
[
\t  addl    \t	rt3, #%_int(1<<n)%, rt4	\n
\t  srl     \t	rt0, rt4, rt5	\n	;;; dividend >> tmp
\t  cmplt   \t	rt5, rt1, rt5	\n	;;; test that less than divisor
\t  cmovlbc \t	rt5, rt4, rt3	\n	;;; tmp -> shift count if not
].dl
endfor
>_#

	sll	rt1, #32, rt1		;;; divisor to top half of reg
	sll	rt0, #32, rt0		;;; dividend to top half of reg
	subq	rt1, #1, rt1		;;; quot bit in low half when subtracted
	sll	rt1, rt3, rt1		;;; shift divisor up by shift count
	;;; switch on (32 - shift count) * 3
	mov	#32, rt5
	subl	rt5, rt3, rt3		;;; J = 32 - shift
	addl	rt3, rt3, rt5		;;; J*2
    .begin_exact
	br	rt4, 3$			;;; get 3$ -- MUST BE 3 INSTRS AFTER
3$:	addl	rt3, rt5, rt3		;;; J*3
	s4addl	rt3, rt4, rt3		;;; *4 and add to 3$
	jmp	(rt3)

#_<
repeat 31 times
[
\t  subq   \t	rt0, rt1, rt3	\n	;;; sub divsr from rem, add in quot bit
\t  srl    \t	rt1, # _1, rt1	\n	;;; shift divisor/quot bit down for next
\t  cmovge \t	rt3, rt3, rt0	\n	;;; replace rem for next if not neg
].dl
endrepeat,
>_#
	subq	rt0, rt1, rt3
	cmovge	rt3, rt3, rt0
    .end_exact

	extll	rt0, #0, rt1		;;; quotient
	srl	rt0, #32, rt0		;;; remainder
	negl	rt1, rt4
	negl	rt0, rt3
	cmovlbs	rt2, rt4, rt1
	cmovlt	rt2, rt3, rt0
	ret	(rchain)

div_zero:
	mov	#-2, r16		;;; SS$_INTDIV
	call_pal gentrap
	ret	(rchain)


;;; --- BIGINT DOUBLE-SLICE BY SLICE DIVIDE --------------------------------
;;; Takes dividend hi slice in rt0, lo slice in rt2, divisor in rt1.
;;; Produces quotient in rt1, remainder in rt0.
;;; Uses rt0 - rt6. Return in rchain.

	.align quad
ediv:
	clr	rt6			;;; clear sign indicators

	;;; get divisor +ve
	bgt	rt1, 2$			;;; br if divisor +ve
	negl	rt1, rt1		;;; else negate it
	bis	rt6, #1, rt6		;;; set negate quot at end
	bgt	rt1, 2$			;;; br if divisor now +ve
	;;; largest -ve divisor (- 2**31)
	mov	rt1, rt3		;;; save it
	negl	rt0, rt1		;;; quot is hi part negated
	mov	rt2, rt0		;;; rem is lo part
	ble	rt1, 1$			;;; br if hi was >= 0 ...
	beq	rt0, 1$			;;; ... or lo was 0
	addl	rt1, #1, rt1		;;; else need neg rem - incr quot by 1
	addl	rt0, rt3, rt0		;;; and add (-ve) divisor to rem
1$: 	ret	(rchain)

	;;; get dividend +ve
2$:	sll	rt0, #31, rt0		;;; combine dividend hi and lo slices
	or	rt0, rt2, rt0

	bge	rt0, 3$			;;; br if dividend >= 0
	negq	rt0, rt0		;;; else negate it
	xor	rt6, #1, rt6		;;; invert negate quot at end
	subl	rt6, #2, rt6		;;; set negate rem at end

	;;; establish amount to shift quotient up to align top with dividend
3$:	clr	rt3			;;; zero shift count

#_<
lvars n;
for n from 4 by -1 to 0 do
[
\t  addl    \t	rt3, #%_int(1<<n)%, rt4	\n	;;; shift count + 2**N  -> tmp
\t  srl     \t	rt0, rt4, rt5	\n	;;; dividend >> tmp
\t  cmplt   \t	rt5, rt1, rt5	\n	;;; test that less than divisor
\t  cmovlbc \t	rt5, rt4, rt3	\n	;;; tmp -> shift count if not
].dl
endfor
>_#

	sll	rt1, rt3, rt1		;;; shift divisor up by shift count
	mov	#1, rt4			;;; bit for quotient
	sll	rt4, rt3, rt4		;;; shift up by shift count
	;;; switch on (32 - shift count) * 6
	mov	#32, rt5
	subl	rt5, rt3, rt3		;;; J = 32 - shift
    .begin_exact
	br	rt2, 4$			;;; get 4$ -- MUST BE 6 INSTRS AFTER
4$:	addl	rt3, rt3, rt5		;;; J*2
	addl	rt3, rt5, rt3		;;; J*3
	addl	rt3, rt3, rt3		;;; J*6
	s4addl	rt3, rt2, rt5		;;; *4 and add to 4$
	clr	rt3			;;; clear quotient
	jmp	(rt5)

#_<
repeat 31 times
[
\t  subq   \t	rt0, rt1, rt5	\n	;;; sub divisor from dividend -> tmp1
\t  addl   \t	rt3, rt4, rt2	\n	;;; add quotient bit -> tmp2
\t  srl    \t	rt1, # _1, rt1	\n	;;; shift divisor for next
\t  srl    \t	rt4, # _1, rt4	\n	;;; shift quotient bit for next
\t  cmovge \t	rt5, rt5, rt0	\n	;;; tmp1 -> dividend if not neg
\t  cmovge \t	rt5, rt2, rt3	\n	;;; tmp2 -> quotient if not neg
].dl
endrepeat,
>_#
	subq	rt0, rt1, rt5
	addl	rt3, rt4, rt2
	cmovge	rt5, rt5, rt0
	cmovge	rt5, rt2, rt3
    .end_exact

	mov	rt3, rt1		;;; quotient
	negl	rt1, rt4
	negl	rt0, rt3
	cmovlbs	rt6, rt4, rt1
	cmovlt	rt6, rt3, rt0
	ret	(rchain)


	.end
