; hppa20.s -- mozilla-central/security/nss/lib/freebl/mpi/hppa20.s

; This Source Code Form is subject to the terms of the Mozilla Public

; License, v. 2.0. If a copy of the MPL was not distributed with this

; file, You can obtain one at http://mozilla.org/MPL/2.0/.

; Select the architecture level the assembler should accept:
; 2.0W when __LP64__ is defined, plain 2.0 otherwise.  The two
; commented-out directives in the #else branch are an older
; alternative (.LEVEL 1.1 plus .ALLOW 2.0N) that is no longer used.

#ifdef __LP64__

        .LEVEL   2.0W

#else

;       .LEVEL   1.1

;       .ALLOW   2.0N

        .LEVEL   2.0

#endif

; Everything that follows is placed in the $CODE$ subspace of the
; $TEXT$ space.

        .SPACE   $TEXT$,SORT=8

        .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24

; ***************************************************************

;                 maxpy_[little/big]

; ***************************************************************

; Select the addressing direction with -DLITTLE_WORDIAN or -DBIG_WORDIAN.
; If neither symbol is supplied, little-wordian is assumed.  The default
; is only applied when nothing was requested, so that -DBIG_WORDIAN no
; longer ends up with both symbols (and both sets of offsets) defined.
#if !defined(LITTLE_WORDIAN) && !defined(BIG_WORDIAN)
#define LITTLE_WORDIAN 1
#endif

; Byte offsets used to step through the vectors.  The plain names move
; towards more significant doublewords, the UN_ names move back towards
; less significant ones.  As in the rest of this file, LITTLE_WORDIAN
; takes precedence if both symbols happen to be defined.
#ifdef LITTLE_WORDIAN
#define EIGHT 8
#define SIXTEEN 16
#define THIRTY_TWO 32
#define UN_EIGHT -8
#define UN_SIXTEEN -16
#define UN_TWENTY_FOUR -24
#else
#define EIGHT -8
#define SIXTEEN -16
#define THIRTY_TWO -32
#define UN_EIGHT 8
#define UN_SIXTEEN 16
#define UN_TWENTY_FOUR 24
#endif

; This performs a multiple-precision integer version of "daxpy",

; Using the selected addressing direction.  "Little-wordian" means that

; the least significant word of a number is stored at the lowest address.

; "Big-wordian" means that the most significant word is at the lowest

; address.  Either way, the incoming address of the vector is that

; of the least significant word.  That means that, for little-wordian

; addressing, we move the address upward as we propagate carries

; from the least significant word to the most significant.  For

; big-wordian we move the address downward.

; We use the following registers:

;     r2   return PC, of course

;     r26 = arg1 =  length

;     r25 = arg2 =  address of scalar

;     r24 = arg3 =  multiplicand vector

;     r23 = arg4 =  result vector

;     fr9 = scalar loaded once only from r25

; The cycle counts shown in the bodies below are simply the result of a

; scheduling by hand.  The actual PCX-U hardware does it differently.

; The intention is that the overall speed is the same.

; The pipeline startup and shutdown code is constructed in the usual way,

; by taking the loop bodies and removing unnecessary instructions.

; We have left the comments describing cycle numbers in the code.

; These are intended for reference when comparing with the main loop,

; and have no particular relationship to actual cycle numbers.

; ---------------------------------------------------------------
; maxpy_little / maxpy_big
;
; Multiplies the vector at r24 (r26 doublewords long) by the 64-bit
; scalar stored at 0(r25) and adds the product into the vector at
; r23, following the carry chain as far as it goes.
;
; In:    r26 = length N, r25 = address of scalar,
;        r24 = multiplicand (least significant doubleword),
;        r23 = result       (least significant doubleword)
; Frame: the entry code stores r3 at the old sp and advances sp by
;        128; r4 is stored at -124(sp).  The doublewords at
;        -104(sp) .. -48(sp) are scratch: XMPYU leaves its 32x32
;        partial products in FP registers, which are written there
;        with FSTD and read back into general registers with LDD.
; Uses:  r1, r3, r4, r19-r22, r28, r29, r31, fr9, fr24-fr31.
;
; NOTE(review): r3 and r4 are saved and restored with 32-bit
; STW/LDW although the body loads them with 64-bit LDD.  Confirm
; that this is sufficient for the target ABI, in particular when
; assembled with __LP64__ (.LEVEL 2.0W).
; ---------------------------------------------------------------
#ifdef LITTLE_WORDIAN
maxpy_little
#else
maxpy_big
#endif
        .PROC
        .CALLINFO FRAME=120,ENTRY_GR=4
        .ENTRY
        STW,MA  %r3,128(%sp)
        STW     %r4,-124(%sp)
        ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.
        FLDD    0(%r25),%fr9        ; fr9 = scalar
; First startup
; (r26 now holds N-1.)
        FLDD    0(%r24),%fr24       ; Cycle 1
        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
        CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3
        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
        FSTD    %fr24,-96(%sp)
        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
        FSTD    %fr25,-80(%sp)
        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
        FSTD    %fr31,-64(%sp)
        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
        FSTD    %fr27,-48(%sp)
; Second startup
        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
        FSTD    %fr30,-56(%sp)
        FLDD    0(%r24),%fr24
        FSTD    %fr26,-88(%sp)      ; Cycle 2
        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
        FSTD    %fr28,-104(%sp)
        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)
        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
        LDD     -56(%sp),%r20
        ADD     %r21,%r3,%r3
        ADD,DC  %r20,%r19,%r19      ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1
        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
        LDD     -104(%sp),%r31
        ADD,DC  %r0,%r0,%r20
        SHRPD   %r19,%r3,32,%r3
        LDD     -72(%sp),%r29       ; Cycle 9
        SHRPD   %r20,%r19,32,%r20
        ADD     %r21,%r1,%r1
        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
        ADD,DC  %r3,%r4,%r4
        FSTD    %fr24,-96(%sp)
        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
        ADD,DC  %r0,%r20,%r20
        LDD     0(%r23),%r3
        FSTD    %fr25,-80(%sp)
        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
        FSTD    %fr31,-64(%sp)
        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
        ADD     %r0,%r0,%r0         ; clear the carry bit
        ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
        FSTD    %fr27,-48(%sp)
;        MFCTL   %cr16,%r21         ; for timing
;        STD     %r21,-112(%sp)
; Here is the loop.
; Each pass handles two doublewords of the multiplicand (r24 and r23
; both advance by SIXTEEN) and counts r26 down by 2.
$LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
        ADD,DC  %r29,%r4,%r4
        FSTD    %fr30,-56(%sp)
        FLDD    0(%r24),%fr24
        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)
        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
        ADD     %r3,%r1,%r1
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21
        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
        ADD,DC  %r21,%r4,%r28
        FSTD    %fr29,-72(%sp)
        LDD     -96(%sp),%r3
        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
        ADD,DC  %r20,%r31,%r22
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
        ADD     %r21,%r3,%r3
        LDD     -56(%sp),%r20
        STD     %r1,UN_SIXTEEN(%r23)
        ADD,DC  %r20,%r19,%r19      ; Cycle 7
        SHRPD   %r3,%r0,32,%r21
        LDD     -88(%sp),%r4
        LDD     -48(%sp),%r1
        ADD,DC  %r0,%r0,%r20        ; Cycle 8
        SHRPD   %r19,%r3,32,%r3
        FLDD    EIGHT(%r24),%fr28
        LDD     -104(%sp),%r31
        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
        ADD     %r21,%r1,%r1
        STD     %r28,UN_EIGHT(%r23)
        LDD     -72(%sp),%r29
        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
        ADD,DC  %r3,%r4,%r4
        FSTD    %fr24,-96(%sp)
        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr25,-80(%sp)
        LDD     0(%r23),%r3
        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
        FSTD    %fr31,-64(%sp)
        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
        ADD     %r22,%r1,%r1
        ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
        FSTD    %fr27,-48(%sp)
$ENDLOOP
; Shutdown code, first stage.
;        MFCTL   %cr16,%r21         ; for timing
;        STD     %r21,UN_SIXTEEN(%r23)
;        LDD     -112(%sp),%r21
;        STD     %r21,UN_EIGHT(%r23)
        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
        ADD,DC  %r29,%r4,%r4
        CMPIB,= 0,%r26,$ONEMORE
        FSTD    %fr30,-56(%sp)
        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)
        ADD     %r3,%r1,%r1         ; Cycle 3
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21
        ADD,DC  %r21,%r4,%r28       ; Cycle 4
        FSTD    %fr29,-72(%sp)
        STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
        LDD     -96(%sp),%r3
        ADD,DC  %r20,%r31,%r22      ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23)
; The N = 2 path enters here from $N_IS_SMALL.
$JOIN4
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        ADD     %r21,%r3,%r3        ; Cycle 6
        LDD     -56(%sp),%r20
        ADD,DC  %r20,%r19,%r19      ; Cycle 7
        SHRPD   %r3,%r0,32,%r21
        LDD     -88(%sp),%r4
        LDD     -48(%sp),%r1
        ADD,DC  %r0,%r0,%r20        ; Cycle 8
        SHRPD   %r19,%r3,32,%r3
        LDD     -104(%sp),%r31
        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
        ADD     %r21,%r1,%r1
        LDD     -72(%sp),%r29
        ADD,DC  %r3,%r4,%r4         ; Cycle 10
        ADD,DC  %r0,%r20,%r20       ; Cycle 11
        LDD     0(%r23),%r3
        ADD     %r22,%r1,%r1        ; Cycle 13
; Shutdown code, second stage.
        ADD,DC  %r29,%r4,%r4        ; Cycle 1
        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
        ADD     %r3,%r1,%r1
        ADD,DC  %r21,%r4,%r28       ; Cycle 4
        ADD,DC  %r20,%r31,%r22      ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
        STD     %r28,UN_EIGHT(%r23) ; Cycle 9
        LDD     0(%r23),%r3         ; Cycle 11
; Shutdown code, third stage.
        LDO     SIXTEEN(%r23),%r23
        ADD     %r3,%r22,%r1
; All paths meet here with the last sum in r1; r21 picks up the carry
; out of the add that produced it.
$JOIN1  ADD,DC  %r0,%r0,%r21
        CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
        STD     %r1,UN_SIXTEEN(%r23)
; Final carry propagation
$FINAL1 LDO     EIGHT(%r23),%r23
        LDD     UN_SIXTEEN(%r23),%r21
        ADDI    1,%r21,%r21
        CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
        STD     %r21,UN_SIXTEEN(%r23)
        B       $L0
        NOP                         ; delay slot of the branch above
; Here is the code that handles the difficult cases N=1, N=2, and N=3.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.
$N_IS_SMALL
        CMPIB,= 0,%r26,$N_IS_ONE
        FSTD    %fr24,-96(%sp)      ; Cycle 10
        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
        FSTD    %fr25,-80(%sp)
        FSTD    %fr31,-64(%sp)      ; Cycle 12
        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
        FSTD    %fr27,-48(%sp)
        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
        CMPIB,= 2,%r26,$N_IS_THREE
        FSTD    %fr30,-56(%sp)
; N = 2
        FSTD    %fr26,-88(%sp)      ; Cycle 2
        FSTD    %fr28,-104(%sp)     ; Cycle 3
        LDD     -96(%sp),%r3        ; Cycle 4
        FSTD    %fr29,-72(%sp)
        B       $JOIN4
        ADD     %r0,%r0,%r22
$N_IS_THREE
        FLDD    SIXTEEN(%r24),%fr24
        FSTD    %fr26,-88(%sp)      ; Cycle 2
        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
        FSTD    %fr28,-104(%sp)
        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)
        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        B       $JOIN3
        ADD     %r0,%r0,%r22
$N_IS_ONE
        FSTD    %fr25,-80(%sp)
        FSTD    %fr27,-48(%sp)
        FSTD    %fr26,-88(%sp)      ; Cycle 2
        B       $JOIN5
        ADD     %r0,%r0,%r22
; We came out of the unrolled loop with wrong parity.  Do one more
; single cycle.  This is quite tricky, because of the way the
; carry chains and SHRPD chains have been chopped up.
$ONEMORE
        FLDD    0(%r24),%fr24
        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)
        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21
        ADD     %r3,%r1,%r1
        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
        ADD,DC  %r21,%r4,%r28
        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)
        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
        ADD,DC  %r20,%r31,%r22
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
; The N = 3 path enters here from $N_IS_THREE.
$JOIN3
        XMPYU   %fr9L,%fr24R,%fr24
        LDD     -56(%sp),%r20
        ADD     %r21,%r3,%r3
        ADD,DC  %r20,%r19,%r19      ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1
        LDD     -104(%sp),%r31      ; Cycle 8
        ADD,DC  %r0,%r0,%r20
        SHRPD   %r19,%r3,32,%r3
        LDD     -72(%sp),%r29       ; Cycle 9
        SHRPD   %r20,%r19,32,%r20
        ADD     %r21,%r1,%r1
        ADD,DC  %r3,%r4,%r4         ; Cycle 10
        FSTD    %fr24,-96(%sp)
        ADD,DC  %r0,%r20,%r20       ; Cycle 11
        LDD     0(%r23),%r3
        FSTD    %fr25,-80(%sp)
        ADD     %r22,%r1,%r1        ; Cycle 13
        FSTD    %fr27,-48(%sp)
; Shutdown code, stage 1-1/2.
        ADD,DC  %r29,%r4,%r4        ; Cycle 1
        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)
        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
        ADD     %r3,%r1,%r1
        ADD,DC  %r21,%r4,%r28       ; Cycle 4
        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
        ADD,DC  %r20,%r31,%r22      ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23)
; The N = 1 path enters here from $N_IS_ONE.
$JOIN5
        LDD     -96(%sp),%r3        ; moved from cycle 4
        LDD     -80(%sp),%r21
        ADD     %r21,%r3,%r3        ; Cycle 6
        ADD,DC  %r0,%r0,%r19        ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1
        SHRPD   %r19,%r3,32,%r3     ; Cycle 8
        ADD     %r21,%r1,%r1        ; Cycle 9
        ADD,DC  %r3,%r4,%r4         ; Cycle 10
        LDD     0(%r23),%r3         ; Cycle 11
        ADD     %r22,%r1,%r1        ; Cycle 13
; Shutdown code, stage 2-1/2.
        ADD,DC  %r0,%r4,%r4         ; Cycle 1
        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
        ADD     %r3,%r1,%r1
        STD     %r1,UN_SIXTEEN(%r23)
        ADD,DC  %r21,%r4,%r1
        B       $JOIN1
        LDO     EIGHT(%r23),%r23
; exit
; Restore r4 and r3 (the LDW,MB in the delay slot of the return also
; moves sp back down by 128) and return through r2.
$L0
        LDW     -124(%sp),%r4
        BVE     (%r2)
        .EXIT
        LDW,MB  -128(%sp),%r3
        .PROCEND

; ***************************************************************

;                 add_diag_[little/big]

; ***************************************************************

; The arguments are as follows:

;     r2   return PC, of course

;     r26 = arg1 =  length

;     r25 = arg2 =  vector to square

;     r24 = arg3 =  result vector

; ---------------------------------------------------------------
; add_diag_little / add_diag_big
;
; For each doubleword of the vector at r25 (r26 doublewords long),
; forms its square from three XMPYU partial products (R*R, L*R and
; L*L) and adds it into the vector at r24, which advances two
; doublewords per input doubleword.  The L*R cross product is
; doubled and aligned by the SHRPD ...,31 pairs before being added.
; After the last doubleword any remaining carry is propagated
; upward through the result.
;
; In:    r26 = length N, r25 = vector to square,
;        r24 = result vector
; Frame: the entry code stores r3 at the old sp and advances sp by
;        128; r4 is stored at -124(sp).  The doublewords at
;        -104(sp) .. -64(sp) are scratch used to move partial
;        products from FP registers to general registers.
; Uses:  r1, r3, r4, r19-r22, r26, r28, r29, r31, fr7, fr24,
;        fr27-fr31.
;
; NOTE(review): r3 and r4 are saved and restored with 32-bit
; STW/LDW although the body writes them as 64-bit values.  Confirm
; that this is sufficient for the target ABI, in particular when
; assembled with __LP64__ (.LEVEL 2.0W).
; ---------------------------------------------------------------
#ifdef LITTLE_WORDIAN
add_diag_little
#else
add_diag_big
#endif
        .PROC
        .CALLINFO FRAME=120,ENTRY_GR=4
        .ENTRY
        STW,MA  %r3,128(%sp)
        STW     %r4,-124(%sp)
        ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.
        NOP                         ; delay slot of the branch above
; Startup code
; (r26 now holds N-1.)
        FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr30
        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
        FSTD    %fr29,-88(%sp)
        FSTD    %fr27,-72(%sp)      ; Cycle 7
        CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
        FSTD    %fr30,-96(%sp)
        FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
        LDD     -88(%sp),%r22       ; Cycle 3
        LDD     -72(%sp),%r31       ; Cycle 4
        XMPYU   %fr7R,%fr7R,%fr28
        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr31
        LDD     -96(%sp),%r20       ; Cycle 6
        FSTD    %fr28,-80(%sp)
        ADD     %r0,%r0,%r0         ; clear the carry bit
        ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
        FSTD    %fr24,-64(%sp)
; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
; Each pass squares two input doublewords: r25 advances by SIXTEEN,
; r24 by THIRTY_TWO, and r26 counts down by 2.
$DIAGLOOP
        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
        LDO     SIXTEEN(%r25),%r25
        LDD     0(%r24),%r1
        FSTD    %fr31,-104(%sp)
        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
        ADD,DC  %r22,%r3,%r3
        FLDD    UN_SIXTEEN(%r25),%fr7
        ADD,DC  %r0,%r20,%r20       ; Cycle 3
        ADD     %r1,%r3,%r3
        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
        LDD     -80(%sp),%r21
        STD     %r3,0(%r24)
        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr30
        LDD     -64(%sp),%r29
        LDD     EIGHT(%r24),%r1
        ADD,DC  %r4,%r20,%r20       ; Cycle 6
        LDD     -104(%sp),%r19
        FSTD    %fr29,-88(%sp)
        ADD     %r20,%r1,%r1        ; Cycle 7
        FSTD    %fr27,-72(%sp)
        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        LDD     UN_SIXTEEN(%r24),%r28
        FSTD    %fr30,-96(%sp)
        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
        ADD,DC  %r21,%r4,%r4
        FLDD    UN_EIGHT(%r25),%fr7
        STD     %r1,UN_TWENTY_FOUR(%r24)
        ADD,DC  %r0,%r19,%r19       ; Cycle 3
        ADD     %r28,%r4,%r4
        XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
        LDD     -88(%sp),%r22
        STD     %r4,UN_SIXTEEN(%r24)
        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr31
        LDD     -72(%sp),%r31
        LDD     UN_EIGHT(%r24),%r28
        ADD,DC  %r3,%r19,%r19       ; Cycle 6
        LDD     -96(%sp),%r20
        FSTD    %fr28,-80(%sp)
        ADD     %r19,%r28,%r28      ; Cycle 7
        FSTD    %fr24,-64(%sp)
        ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
        STD     %r28,UN_EIGHT(%r24)
$ENDDIAGLOOP
        ADD,DC  %r0,%r22,%r22
        CMPIB,= 0,%r26,$ONEMOREDIAG
        SHRPD   %r31,%r0,31,%r3
; Shutdown code, first stage.
        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20       ; Cycle 3
        LDD     -80(%sp),%r21
        ADD     %r3,%r28,%r3
        LDD     -64(%sp),%r29       ; Cycle 4
        STD     %r3,0(%r24)
        LDD     EIGHT(%r24),%r1     ; Cycle 5
        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
        LDD     -104(%sp),%r19
        ADD,DC  %r4,%r20,%r20
        ADD     %r20,%r1,%r1        ; Cycle 7
        ADD,DC  %r0,%r21,%r21       ; Cycle 8
        STD     %r1,EIGHT(%r24)
; Shutdown code, second stage.
        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        LDD     UN_SIXTEEN(%r24),%r1
        SHRPD   %r0,%r29,31,%r3      ; Cycle 2
        ADD     %r4,%r21,%r4
        ADD,DC  %r0,%r19,%r19       ; Cycle 3
        ADD     %r4,%r1,%r4
        STD     %r4,UN_SIXTEEN(%r24); Cycle 4
        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
        ADD,DC  %r3,%r19,%r19       ; Cycle 6
        ADD     %r19,%r28,%r28      ; Cycle 7
        ADD,DC  %r0,%r0,%r22        ; Cycle 8
        CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
        STD     %r28,UN_EIGHT(%r24)
; Final carry propagation
$FDIAG2
        LDO     EIGHT(%r24),%r24
        LDD     UN_EIGHT(%r24),%r26
        ADDI    1,%r26,%r26
        CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
        STD     %r26,UN_EIGHT(%r24)
        B   $Z0
        NOP                         ; delay slot of the branch above
; Here is the code that handles the difficult case N=1.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.
$DIAG_N_IS_ONE
        LDD     -88(%sp),%r22
        LDD     -72(%sp),%r31
        B       $JOINDIAG
        LDD     -96(%sp),%r20
; We came out of the unrolled loop with wrong parity.  Do one more
; single cycle.  This is the "alternate body".  It will, of course,
; give us opposite registers from the other case, so we need
; completely different shutdown code.
$ONEMOREDIAG
        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        FLDD    0(%r25),%fr7        ; Cycle 2
        SHRPD   %r0,%r31,31,%r4
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20       ; Cycle 3
        LDD     -80(%sp),%r21
        ADD     %r3,%r28,%r3
        LDD     -64(%sp),%r29       ; Cycle 4
        STD     %r3,0(%r24)
        XMPYU   %fr7R,%fr7R,%fr29
        LDD     EIGHT(%r24),%r1     ; Cycle 5
        XMPYU   %fr7L,%fr7R,%fr27
        XMPYU   %fr7L,%fr7L,%fr30
        LDD     -104(%sp),%r19      ; Cycle 6
        FSTD    %fr29,-88(%sp)
        ADD,DC  %r4,%r20,%r20
        FSTD    %fr27,-72(%sp)      ; Cycle 7
        ADD     %r20,%r1,%r1
        ADD,DC  %r0,%r21,%r21       ; Cycle 8
        STD     %r1,EIGHT(%r24)
; Shutdown code, first stage.
        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        FSTD    %fr30,-96(%sp)
        LDD     UN_SIXTEEN(%r24),%r1
        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
        ADD     %r4,%r21,%r4
        ADD,DC  %r0,%r19,%r19       ; Cycle 3
        LDD     -88(%sp),%r22
        ADD     %r4,%r1,%r4
        LDD     -72(%sp),%r31       ; Cycle 4
        STD     %r4,UN_SIXTEEN(%r24)
        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
        LDD     -96(%sp),%r20       ; Cycle 6
        ADD,DC  %r3,%r19,%r19
        ADD     %r19,%r28,%r28      ; Cycle 7
        ADD,DC  %r0,%r22,%r22       ; Cycle 8
        STD     %r28,UN_EIGHT(%r24)
; Shutdown code, second stage.
; The N = 1 path enters here from $DIAG_N_IS_ONE.
$JOINDIAG
        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20       ; Cycle 3
        ADD     %r3,%r28,%r3
        STD     %r3,0(%r24)         ; Cycle 4
        LDD     EIGHT(%r24),%r1     ; Cycle 5
        ADD,DC  %r4,%r20,%r20
        ADD     %r20,%r1,%r1        ; Cycle 7
        ADD,DC  %r0,%r0,%r21        ; Cycle 8
        CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
        STD     %r1,EIGHT(%r24)
; Final carry propagation
; (falls through into the exit code once a word stops carrying)
$FDIAG1
        LDO     EIGHT(%r24),%r24
        LDD     EIGHT(%r24),%r26
        ADDI    1,%r26,%r26
        CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
        STD     %r26,EIGHT(%r24)
; exit
; Restore r4 and r3 (the LDW,MB in the delay slot of the return also
; moves sp back down by 128) and return through r2.
$Z0
        LDW     -124(%sp),%r4
        BVE     (%r2)
        .EXIT
        LDW,MB  -128(%sp),%r3
        .PROCEND

;	.ALLOW
        .SPACE         $TEXT$
        .SUBSPA        $CODE$
; Export the two entry points for the selected addressing direction.
; GNU-as (as of 2.19) does not support LONG_RETURN, so that attribute
; is only requested when not building with a GNU toolchain.  This
; applies to the big-wordian names as well as the little-wordian ones.
#ifdef LITTLE_WORDIAN
#ifdef __GNUC__
        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#else
#ifdef __GNUC__
        .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
        .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
        .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
        .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#endif
        .END

; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
; (In this file the routines are exported under the names
; "maxpy_little" and "maxpy_big".)

; The routine "maxpy_PA20_little" or "maxpy_PA20_big"

; performs a 64-bit x any-size multiply, and adds the

; result to an area of memory.  That is, it performs

; something like

;      A B C D

;    *       Z

;   __________

;    P Q R S T

; and then adds the "PQRST" vector into an area of memory,

; handling all carries.

; Digression on nomenclature and endian-ness:

; Each of the capital letters in the above represents a 64-bit

; quantity.  That is, you could think of the discussion as

; being in terms of radix-16-quintillion arithmetic.  The data

; type being manipulated is "unsigned long long int".  This

; requires the 64-bit extension of the HP-UX C compiler,

; available at release 10.  You need these compiler flags to

; enable these extensions:

;       -Aa +e +DA2.0 +DS2.0

; (The first specifies ANSI C, the second enables the

; extensions, which are beyond ANSI C, and the third and

; fourth tell the compiler to use whatever features of the

; PA2.0 architecture it wishes, in order to make the code more

; efficient.  Since the presence of the assembly code will

; make the program unable to run on anything less than PA2.0,

; you might as well gain the performance enhancements in the C

; code as well.)

; Questions of "endian-ness" often come up, usually in the

; context of byte ordering in a word.  These routines have a

; similar issue, that could be called "wordian-ness".

; Independent of byte ordering (PA is always big-endian), one

; can make two choices when representing extremely large

; numbers as arrays of 64-bit doublewords in memory.

; "Little-wordian" layout means that the least significant

; word of a number is stored at the lowest address.

;   MSW     LSW

;    |       |

;    V       V

;    A B C D E

;    ^     ^ ^

;    |     | |____ address 0

;    |     |

;    |     |_______address 8

;    |

;    address 32

; "Big-wordian" means that the most significant word is at the

; lowest address.

;   MSW     LSW

;    |       |

;    V       V

;    A B C D E

;    ^     ^ ^

;    |     | |____ address 32

;    |     |

;    |     |_______address 24

;    |

;    address 0

; When you compile the file, you must specify one or the other, with

; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".

;     Incidentally, you assemble this file as part of your

;     project with the same C compiler as the rest of the program.

;     My "makefile" for a superprecision arithmetic package has

;     the following stuff:

;     # definitions:

;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1

;     CFLAGS = +O3

;     LDFLAGS = -L /usr/lib -Wl,-aarchive

;     # general build rule for ".s" files:

;     .s.o:

;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN

;     # Now any bind step that calls for pa20.o will assemble pa20.s

; End of digression, back to arithmetic:

; The way we multiply two huge numbers is, of course, to multiply

; the "ABCD" vector by each of the "WXYZ" doublewords, adding

; the result vectors with increasing offsets, the way we learned

; in school, back before we all used calculators:

;            A B C D

;          * W X Y Z

;         __________

;          P Q R S T

;        E F G H I

;      M N O P Q

;  + R S T U V

;    _______________

;    F I N A L S U M

; So we call maxpy_PA20_big (in my case; my package is

; big-wordian) repeatedly, giving the W, X, Y, and Z arguments

; in turn as the "scalar", and giving the "ABCD" vector each

; time.  We direct it to add its result into an area of memory

; that we have cleared at the start.  We skew the exact

; location into that area with each call.

; The prototype for the function is

; extern void maxpy_PA20_big(

;    int length,        /* Number of doublewords in the multiplicand vector. */

;    const long long int *scalaraddr,    /* Address to fetch the scalar. */

;    const long long int *multiplicand,  /* The multiplicand vector. */

;    long long int *result);             /* Where to accumulate the result. */

; (You should place a copy of this prototype in an include file

; or in your C file.)

; Now, IN ALL CASES, the given address for the multiplicand or

; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.

; That word is, of course, the word at which the routine

; starts processing.  "maxpy_PA20_little" then increases the

; addresses as it computes.  "maxpy_PA20_big" decreases them.

; In our example above, "length" would be 4 in each case.

; "multiplicand" would be the "ABCD" vector.  Specifically,

; the address of the element "D".  "scalaraddr" would be the

; address of "W", "X", "Y", or "Z" on the four calls that we

; would make.  (The order doesn't matter, of course.)

; "result" would be the appropriate address in the result

; area.  When multiplying by "Z", that would be the least

; significant word.  When multiplying by "Y", it would be the

; next higher word (8 bytes higher if little-wordian; 8 bytes

; lower if big-wordian), and so on.  The size of the result

; area must be the sum of the sizes of the multiplicand

; and multiplier vectors, and must be initialized to zero

; before we start.

; Whenever the routine adds its partial product into the result

; vector, it follows carry chains as far as they need to go.

; Here is the super-precision multiply routine that I use for

; my package.  The package is big-wordian.  I have taken out

; handling of exponents (it's a floating point package):

; static void mul_PA20(

;   int size,

;   const long long int *arg1,

;   const long long int *arg2,

;   long long int *result)

; {

;    int i;

;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;

;    for (i=0 ; i<size ; i++) {

;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);

;    }

; }