# mpi_sse2.s

# This Source Code Form is subject to the terms of the Mozilla Public

# License, v. 2.0. If a copy of the MPL was not distributed with this

# file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifdef DARWIN

#define s_mpv_mul_d          _s_mpv_mul_d

#define s_mpv_mul_d_add      _s_mpv_mul_d_add

#define s_mpv_mul_d_add_prop _s_mpv_mul_d_add_prop

#define s_mpv_sqr_add_prop   _s_mpv_sqr_add_prop

#define s_mpv_div_2dx1d      _s_mpv_div_2dx1d

#define TYPE_FUNCTION(x)

#else

#define TYPE_FUNCTION(x) .type x, @function

#endif

.text

 #  s_mpv_mul_d(a, a_len, b, c)
 #
 #  Multiplies the a_len 32-bit words at a (least significant word
 #  first) by the 32-bit word b and stores a_len + 1 result words at c.
 #  The last word written, c[a_len], is the final carry.  When a_len
 #  is 0 the only store is c[0] = 0.
 #
 #  Stack frame:
 #  ebp - 8:    saved esi
 #  ebp - 4:    saved edi
 #  ebp + 0:    saved ebp
 #  ebp + 4:    return address
 #  ebp + 8:    a       argument
 #  ebp + 12:   a_len   argument
 #  ebp + 16:   b       argument
 #  ebp + 20:   c       argument
 #
 #  Register use:
 #      ecx:    words of a still to process
 #      esi:    a ptr
 #      edi:    c ptr
 #      mm0:    64-bit product of the current word and b
 #      mm1:    b
 #      mm2:    running carry
 #
 #  Symbol visibility: .private_extern is the Mach-O directive.  The
 #  non-DARWIN branch of this file is ELF (see TYPE_FUNCTION), where
 #  the equivalent directive is .hidden.
.globl s_mpv_mul_d
#ifdef DARWIN
.private_extern s_mpv_mul_d
#else
.hidden s_mpv_mul_d
#endif
TYPE_FUNCTION(s_mpv_mul_d)
s_mpv_mul_d:
    push   %ebp
    mov    %esp, %ebp
    push   %edi
    push   %esi
    pxor   %mm2, %mm2           # carry = 0
    mov    12(%ebp), %ecx       # ecx = a_len
    movd   16(%ebp), %mm1       # mm1 = b
    mov    20(%ebp), %edi       # edi = c
    test   %ecx, %ecx
    jz     2f                   # a_len == 0: only the zero carry is stored
    mov    8(%ebp), %esi        # esi = a
    cld                         # clears DF; this routine has no string instruction
1:
    movd   0(%esi), %mm0        # mm0 = *a
    add    $4, %esi             # a++
    pmuludq %mm1, %mm0          # mm0 = b * (word just loaded), 64-bit product
    paddq  %mm0, %mm2           # product + carry from the previous word
    movd   %mm2, 0(%edi)        # *c = low 32 bits
    add    $4, %edi             # c++
    psrlq  $32, %mm2            # carry = high 32 bits
    dec    %ecx                 # --a_len
    jnz    1b                   # loop while words remain
2:
    movd   %mm2, 0(%edi)        # *c = final carry
    emms                        # leave MMX state so x87 code can run again
    pop    %esi
    pop    %edi
    leave
    ret
    nop

 #  s_mpv_mul_d_add(a, a_len, b, c)
 #
 #  For each of the a_len 32-bit words at a (least significant word
 #  first) computes a[i] * b + c[i] + carry, stores the low 32 bits
 #  back to c[i] and keeps the high 32 bits as the next carry.
 #  Afterwards c[a_len] is overwritten with the final carry; its
 #  previous contents are not added in.
 #
 #  Stack frame:
 #  ebp - 8:    saved esi
 #  ebp - 4:    saved edi
 #  ebp + 0:    saved ebp
 #  ebp + 4:    return address
 #  ebp + 8:    a       argument
 #  ebp + 12:   a_len   argument
 #  ebp + 16:   b       argument
 #  ebp + 20:   c       argument
 #
 #  Register use:
 #      ecx:    words of a still to process
 #      esi:    a ptr
 #      edi:    c ptr
 #      mm0:    current product, then the current word of c
 #      mm1:    b
 #      mm2:    running carry
 #
 #  Symbol visibility: .private_extern is the Mach-O directive.  The
 #  non-DARWIN branch of this file is ELF (see TYPE_FUNCTION), where
 #  the equivalent directive is .hidden.
.globl s_mpv_mul_d_add
#ifdef DARWIN
.private_extern s_mpv_mul_d_add
#else
.hidden s_mpv_mul_d_add
#endif
TYPE_FUNCTION(s_mpv_mul_d_add)
s_mpv_mul_d_add:
    push   %ebp
    mov    %esp, %ebp
    push   %edi
    push   %esi
    pxor   %mm2, %mm2           # carry = 0
    mov    12(%ebp), %ecx       # ecx = a_len
    movd   16(%ebp), %mm1       # mm1 = b
    mov    20(%ebp), %edi       # edi = c
    test   %ecx, %ecx
    jz     2f                   # a_len == 0: only the zero carry is stored
    mov    8(%ebp), %esi        # esi = a
    cld                         # clears DF; this routine has no string instruction
1:
    movd   0(%esi), %mm0        # mm0 = *a
    add    $4, %esi             # a++
    pmuludq %mm1, %mm0          # mm0 = b * (word just loaded), 64-bit product
    paddq  %mm0, %mm2           # product + carry from the previous word
    movd   0(%edi), %mm0        # mm0 = *c
    paddq  %mm0, %mm2           # ... + current word of c
    movd   %mm2, 0(%edi)        # *c = low 32 bits
    add    $4, %edi             # c++
    psrlq  $32, %mm2            # carry = high 32 bits
    dec    %ecx                 # --a_len
    jnz    1b                   # loop while words remain
2:
    movd   %mm2, 0(%edi)        # *c = final carry (overwrites, does not add)
    emms                        # leave MMX state so x87 code can run again
    pop    %esi
    pop    %edi
    leave
    ret
    nop

 #  s_mpv_mul_d_add_prop(a, a_len, b, c)
 #
 #  For each of the a_len 32-bit words at a (least significant word
 #  first) computes a[i] * b + c[i] + carry, stores the low 32 bits
 #  back to c[i] and keeps the high 32 bits as the next carry.
 #  A non-zero final carry is then added into c[a_len], and any carry
 #  out of that addition is propagated through the following words of
 #  c.  The propagation loop has no length bound: it stops at the
 #  first addition that produces no carry.
 #
 #  Stack frame:
 #  ebp - 12:   saved ebx
 #  ebp - 8:    saved esi
 #  ebp - 4:    saved edi
 #  ebp + 0:    saved ebp
 #  ebp + 4:    return address
 #  ebp + 8:    a       argument
 #  ebp + 12:   a_len   argument
 #  ebp + 16:   b       argument
 #  ebp + 20:   c       argument
 #
 #  Register use:
 #      eax:    word of c being updated during carry propagation
 #      ebx:    final carry out of the multiply loop
 #      ecx:    words of a still to process
 #      esi:    a ptr
 #      edi:    c ptr
 #      mm0:    current product
 #      mm1:    b
 #      mm2:    running carry
 #      mm3:    current word of c
 #
 #  Symbol visibility: .private_extern is the Mach-O directive.  The
 #  non-DARWIN branch of this file is ELF (see TYPE_FUNCTION), where
 #  the equivalent directive is .hidden.
.globl s_mpv_mul_d_add_prop
#ifdef DARWIN
.private_extern s_mpv_mul_d_add_prop
#else
.hidden s_mpv_mul_d_add_prop
#endif
TYPE_FUNCTION(s_mpv_mul_d_add_prop)
s_mpv_mul_d_add_prop:
    push   %ebp
    mov    %esp, %ebp
    push   %edi
    push   %esi
    push   %ebx
    pxor   %mm2, %mm2           # carry = 0
    mov    12(%ebp), %ecx       # ecx = a_len
    movd   16(%ebp), %mm1       # mm1 = b
    mov    20(%ebp), %edi       # edi = c
    test   %ecx, %ecx
    jz     2f                   # a_len == 0: carry is 0, nothing is written
    mov    8(%ebp), %esi        # esi = a
    cld                         # stosl below must advance edi upwards
1:
    movd   0(%esi), %mm0        # mm0 = *a
    movd   0(%edi), %mm3        # mm3 = *c
    add    $4, %esi             # a++
    pmuludq %mm1, %mm0          # mm0 = b * (word just loaded), 64-bit product
    paddq  %mm0, %mm2           # product + carry from the previous word
    paddq  %mm3, %mm2           # ... + current word of c
    movd   %mm2, 0(%edi)        # *c = low 32 bits
    add    $4, %edi             # c++
    psrlq  $32, %mm2            # carry = high 32 bits
    dec    %ecx                 # --a_len
    jnz    1b                   # loop while words remain
2:
    movd   %mm2, %ebx           # ebx = final carry
    test   %ebx, %ebx           # is carry zero?
    jz     4f
    mov    0(%edi), %eax
    add    %ebx, %eax           # *c + carry, CF = carry out
    stosl                       # [edi] = eax; edi += 4; flags untouched
    jnc    4f
3:
    mov    0(%edi), %eax        # add in current word from *c
    adc    $0, %eax             # + CF from the previous word
    stosl                       # [edi] = eax; edi += 4; flags untouched
    jc     3b                   # keep going while the add carries out
4:
    emms                        # leave MMX state so x87 code can run again
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop

 #  s_mpv_sqr_add_prop(pa, a_len, ps)
 #
 #  For each of the a_len 32-bit words at pa, adds the 64-bit square
 #  of that word plus the running carry into the next two words of ps
 #  (ps advances by two words per word of pa).  A non-zero final carry
 #  is then added into the following word of ps, and any carry out of
 #  that addition is propagated through the words after it.  The
 #  propagation loop has no length bound: it stops at the first
 #  addition that produces no carry.
 #
 #  Stack frame:
 #  ebp - 12:   saved ebx
 #  ebp - 8:    saved esi
 #  ebp - 4:    saved edi
 #  ebp + 0:    saved ebp
 #  ebp + 4:    return address
 #  ebp + 8:    pa      argument
 #  ebp + 12:   a_len   argument
 #  ebp + 16:   ps      argument
 #
 #  Register use:
 #      eax:    word of ps being updated during carry propagation
 #      ebx:    final carry out of the squaring loop
 #      ecx:    words of pa still to process
 #      esi:    pa ptr
 #      edi:    ps ptr
 #      mm0:    current word of pa, then its square
 #      mm2:    running carry
 #      mm3:    current word of ps
 #
 #  Symbol visibility: .private_extern is the Mach-O directive.  The
 #  non-DARWIN branch of this file is ELF (see TYPE_FUNCTION), where
 #  the equivalent directive is .hidden.
.globl s_mpv_sqr_add_prop
#ifdef DARWIN
.private_extern s_mpv_sqr_add_prop
#else
.hidden s_mpv_sqr_add_prop
#endif
TYPE_FUNCTION(s_mpv_sqr_add_prop)
s_mpv_sqr_add_prop:
    push   %ebp
    mov    %esp, %ebp
    push   %edi
    push   %esi
    push   %ebx
    pxor   %mm2, %mm2           # carry = 0
    mov    12(%ebp), %ecx       # ecx = a_len
    mov    16(%ebp), %edi       # edi = ps
    test   %ecx, %ecx
    jz     2f                   # a_len == 0: carry is 0, nothing is written
    mov    8(%ebp), %esi        # esi = pa
    cld                         # stosl below must advance edi upwards
1:
    movd   0(%esi), %mm0        # mm0 = *pa
    movd   0(%edi), %mm3        # mm3 = low word of the ps pair
    add    $4, %esi             # pa++
    pmuludq %mm0, %mm0          # mm0 = square of the word, 64 bits
    paddq  %mm0, %mm2           # square + carry from the previous pair
    paddq  %mm3, %mm2           # ... + low word of ps
    movd   4(%edi), %mm3        # mm3 = high word of the ps pair
    movd   %mm2, 0(%edi)        # store the low result word
    psrlq  $32, %mm2            # move on to the high half
    paddq  %mm3, %mm2           # ... + high word of ps
    movd   %mm2, 4(%edi)        # store the high result word
    psrlq  $32, %mm2            # carry into the next pair
    add    $8, %edi             # ps += 2 words
    dec    %ecx                 # --a_len
    jnz    1b                   # loop while words remain
2:
    movd   %mm2, %ebx           # ebx = final carry
    test   %ebx, %ebx           # is carry zero?
    jz     4f
    mov    0(%edi), %eax
    add    %ebx, %eax           # *ps + carry, CF = carry out
    stosl                       # [edi] = eax; edi += 4; flags untouched
    jnc    4f
3:
    mov    0(%edi), %eax        # add in current word from *ps
    adc    $0, %eax             # + CF from the previous word
    stosl                       # [edi] = eax; edi += 4; flags untouched
    jc     3b                   # keep going while the add carries out
4:
    emms                        # leave MMX state so x87 code can run again
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop

 # Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
 # so its high bit is 1.   This code is from NSPR.
 # mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
 #                        mp_digit *qp, mp_digit *rp)
 #
 # Stores the 32-bit quotient at *qp and the 32-bit remainder at *rp,
 # then returns 0 in eax.
 # NOTE(review): the x86 div instruction raises a divide error when
 # the quotient does not fit in 32 bits (Nhi >= divisor) or when the
 # divisor is 0.  No check is made here, so callers must rule both out.
 #
 # Stack layout after the push below:
 #  esp +  0:   saved ebx
 #  esp +  4:   return address
 #  esp +  8:   Nhi     argument
 #  esp + 12:   Nlo     argument
 #  esp + 16:   divisor argument
 #  esp + 20:   qp      argument
 #  esp + 24:   rp      argument
 #
 # Register use:
 #      eax:    Nlo, then the quotient, finally the return value 0
 #      ebx:    divisor, then qp, then rp
 #      edx:    Nhi, then the remainder
 #
 # Symbol visibility: .private_extern is the Mach-O directive.  The
 # non-DARWIN branch of this file is ELF (see TYPE_FUNCTION), where
 # the equivalent directive is .hidden.
.globl s_mpv_div_2dx1d
#ifdef DARWIN
.private_extern s_mpv_div_2dx1d
#else
.hidden s_mpv_div_2dx1d
#endif
TYPE_FUNCTION(s_mpv_div_2dx1d)
s_mpv_div_2dx1d:
       push   %ebx
       mov    8(%esp), %edx     # edx = Nhi
       mov    12(%esp), %eax    # eax = Nlo
       mov    16(%esp), %ebx    # ebx = divisor
       div    %ebx              # eax = edx:eax / ebx, edx = remainder
       mov    20(%esp), %ebx    # ebx = qp
       mov    %eax, 0(%ebx)     # *qp = quotient
       mov    24(%esp), %ebx    # ebx = rp
       mov    %edx, 0(%ebx)     # *rp = remainder
       xor    %eax, %eax        # return zero
       pop    %ebx
       ret
       nop

#ifndef DARWIN

 # Magic indicating no need for an executable stack

.section .note.GNU-stack, "", @progbits

.previous

#endif