/* pixman-arm-simd-asm.S */

/*

 * Copyright © 2012 Raspberry Pi Foundation

 * Copyright © 2012 RISC OS Open Ltd

 * Permission to use, copy, modify, distribute, and sell this software and its

 * documentation for any purpose is hereby granted without fee, provided that

 * the above copyright notice appear in all copies and that both that

 * copyright notice and this permission notice appear in supporting

 * documentation, and that the name of the copyright holders not be used in

 * advertising or publicity pertaining to distribution of the software without

 * specific, written prior permission.  The copyright holders make no

 * representations about the suitability of this software for any purpose.  It

 * is provided "as is" without express or implied warranty.

 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS

 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND

 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY

 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES

 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN

 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING

 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS

 * SOFTWARE.

 * Author:  Ben Avison (bavison@riscosopen.org)

*/

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	/* Accept ARMv6 instructions (the SIMD ops such as UADD8/SEL/UXTB16 used
	 * below), but record ARMv4 as the architecture in the object file. */
	.arch armv6
	.object_arch armv4
	/* ARM (not Thumb) instruction set */
	.arm
	/* Alternate macro mode: the macros below rely on %(expr) to evaluate
	 * register-number arithmetic in macro arguments. */
	.altmacro
	/* Align to a 4-byte boundary */
	.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

	/* Macro provided by the included headers; presumably selects unified
	 * assembler syntax -- confirm against pixman-arm-asm.h. */
	pixman_syntax_unified

/* A head macro should do all processing which results in an output of up to

 * 16 bytes, as far as the final load instruction. The corresponding tail macro

 * should complete the processing of the up-to-16 bytes. The calling macro will

 * sometimes choose to insert a preload or a decrement of X between them.

 *   cond           ARM condition code for code block

 *   numbytes       Number of output bytes that should be generated this time

 *   firstreg       First WK register in which to place output

 *   unaligned_src  Whether to use non-wordaligned loads of source image

 *   unaligned_mask Whether to use non-wordaligned loads of mask image

 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output

*/

/* Init hook shared by the plain blit routines.
 * Passes STRIDE_D and STRIDE_S to line_saved_regs (defined in the included
 * header). Presumably this makes the framework save them around each line,
 * which is what lets blit_inner_loop reuse them as WK4/WK5 -- confirm
 * against the header. */
.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

/* Head step for blits: load numbytes of source data from SRC into the WK
 * registers starting at WK<firstreg>, using pixld with the given condition.
 * unaligned_mask and preload are accepted but not used by this macro. */
.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

/* Custom inner loop for blits: copies 32 bytes per iteration through eight
 * work registers (two 16-byte loads followed by two 16-byte stores).
 * WK4-WK7 are aliased to STRIDE_D, STRIDE_S, MASK and STRIDE_M for the
 * duration of the macro and released again at the end.
 * The process_head, process_tail, unaligned_mask and dst_alignment
 * parameters are not referenced in the body. */
.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, \unaligned_src
        pixld   , 16, 4, SRC, \unaligned_src
        /* Preload at SRC+SCRATCH; SCRATCH is presumably the prefetch offset
         * prepared by the calling framework -- confirm in the header. */
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        /* Account for the 32 bytes just copied, expressed in pixels */
        subs    X, X, #32*8/src_bpp
        /* Keep looping while the subtraction did not borrow */
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Instantiate pixman_composite_src_8888_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * The whole invocation must be one logical line: each continued line ends in
 * a backslash immediately followed by the next argument line, because a
 * blank line after a backslash would terminate the argument list. */
generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/* Instantiate pixman_composite_src_0565_0565_asm_armv6 through
 * generate_composite_function (defined in the included header), reusing the
 * blit macros. The three numbers after the name are presumably the source,
 * mask and destination bits per pixel -- confirm against the header.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/* Instantiate pixman_composite_src_8_8_asm_armv6 through
 * generate_composite_function (defined in the included header), reusing the
 * blit macros. The three numbers after the name are presumably the source,
 * mask and destination bits per pixel -- confirm against the header.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

/* Init for the 32-bit constant fill: read the constant source word from the
 * stack arguments and copy it into the other three registers that
 * fill_process_tail stores from (SRC, STRIDE_S, MASK, STRIDE_M). */
.macro src_n_8888_init
        /* SRC = constant source word */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Replicate it; the three copies are independent of one another */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

/* Init for the 16-bit constant fill: read the constant source halfword from
 * the stack arguments, duplicate it into both halves of a word, then copy
 * that word into the other three registers that fill_process_tail stores
 * from (SRC, STRIDE_S, MASK, STRIDE_M). */
.macro src_n_0565_init
        /* SRC = 0x0000hhhh (LDRH zero-extends) */
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        /* SRC = 0xhhhhhhhh */
        orr     SRC, SRC, SRC, lsl #16
        /* Replicate it; the three copies are independent of one another */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

/* Init for the 8-bit constant fill: read the constant source byte from the
 * stack arguments, splat it across all four bytes of a word, then copy that
 * word into the other three registers that fill_process_tail stores from
 * (SRC, STRIDE_S, MASK, STRIDE_M). */
.macro src_n_8_init
        /* SRC = 0x000000bb (LDRB zero-extends) */
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        /* SRC = 0x00bb00bb */
        orr     SRC, SRC, SRC, lsl #16
        /* SRC = 0xbbbbbbbb */
        orr     SRC, SRC, SRC, lsl #8
        /* Replicate it; the three copies are independent of one another */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

/* Tail step for constant fills: store numbytes to DST from the registers
 * holding the replicated fill value. WK4-WK7 are aliased to SRC, STRIDE_S,
 * MASK and STRIDE_M (the registers the src_n_*_init macros fill) just for
 * the store, which always starts at WK4; firstreg is not used. */
.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   \cond, \numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Instantiate pixman_composite_src_n_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

/* Instantiate pixman_composite_src_n_0565_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

/* Instantiate pixman_composite_src_n_8_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail

/******************************************************************************/

/* Force the top byte of WK<reg> to 0xFF (conditionally, on cond), leaving
 * the lower 24 bits untouched. */
.macro src_x888_8888_pixel, cond, reg
        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm

/* Head step: load numbytes of source pixels from SRC into the WK registers
 * starting at WK<firstreg>. unaligned_mask and preload are not used. */
.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

/* Tail step: set the top byte to 0xFF in each loaded pixel.
 * One register is always processed; a second when numbytes is at least 8;
 * the third and fourth only when numbytes is exactly 16. */
.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        src_x888_8888_pixel \cond, %(\firstreg+0)
 .if \numbytes >= 8
        src_x888_8888_pixel \cond, %(\firstreg+1)
 .endif
 /* numbytes == 16 implies numbytes >= 8, so this need not be nested */
 .if \numbytes == 16
        src_x888_8888_pixel \cond, %(\firstreg+2)
        src_x888_8888_pixel \cond, %(\firstreg+3)
 .endif
.endm

/* Instantiate pixman_composite_src_x888_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/

/* Init for the 0565 -> 8888 conversion: prime the GE flags for SEL and load
 * the two constants the per-pixel macros expect in MASK and STRIDE_M. */
.macro src_0565_8888_init
        /* Set GE[3:0] to 1010 so SEL instructions do what we want:
         * adding 0x80 to 0x80 carries out of bytes 3 and 1 only */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
        /* Hold loop invariants in STRIDE_M and MASK */
        mov     STRIDE_M, #0xFF000000
        ldr     MASK, =0x07E007E0
.endm

/* Expand two packed 16-bit 0565 pixels into two 32-bit pixels.
 *   reg1  On entry, WK<reg1> holds both 16-bit pixels; on exit it holds the
 *         expanded pixel that came from the low halfword.
 *   reg2  On exit, WK<reg2> holds the expanded pixel from the high halfword.
 * Expects MASK = 0x07E007E0, STRIDE_M = 0xFF000000 and GE[3:0] = 1010, as
 * set up by src_0565_8888_init. Corrupts SCRATCH.
 * The trailing bit diagrams show the destination register after each
 * instruction (upper-case = high-halfword pixel, lower-case = low-halfword
 * pixel, '-' = don't care). */
.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000
        bic     WK\()\reg2, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK\()\reg1, WK\()\reg2, lsl #16             @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19                   @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK\()\reg1, WK\()\reg1, SCRATCH             @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16                   @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK\()\reg2, WK\()\reg2, SCRATCH             @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8    @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8    @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.

   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?

        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000

        bic     WK\()\reg1, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb

        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg

        mov     WK\()\reg2, WK\()\reg1, lsr #16             @ 0000000000000000RRRRR000000BBBBB

        mov     SCRATCH, SCRATCH, ror #27                   @ GGGGGGGGGGGG0000ggggggggggg00000

        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb

        mov     WK\()\reg2, WK\()\reg2, lsl #3              @ 0000000000000RRRRR000000BBBBB000

        mov     WK\()\reg1, WK\()\reg1, lsl #3              @ 0000000000000rrrrr000000bbbbb000

        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB

        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb

        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB

        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb

        sel     WK\()\reg2, SCRATCH, WK\()\reg2             @ --------RRRRRRRRGGGGGGGGBBBBBBBB

        sel     WK\()\reg1, SCRATCH, WK\()\reg1             @ --------rrrrrrrrggggggggbbbbbbbb

        orr     WK\()\reg2, WK\()\reg2, #0xFF000000         @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB

        orr     WK\()\reg1, WK\()\reg1, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb

*/

/* Expand a single 16-bit 0565 pixel held in the low half of WK<reg> into a
 * 32-bit pixel with the top byte set to 0xFF, in place.
 * Expects MASK = 0x07E007E0 and GE[3:0] = 1010, as set up by
 * src_0565_8888_init. Corrupts SCRATCH.
 * The trailing bit diagrams show the destination register after each
 * instruction; they assume the upper halfword of WK<reg> is zero on entry. */
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK\()\reg, MASK                 @ 0000000000000000rrrrr000000bbbbb
        and     WK\()\reg, WK\()\reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3                 @ 0000000000000rrrrr000000bbbbb000
        mov     WK\()\reg, WK\()\reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5        @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6  @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5        @ --------rrrrrrrr--------bbbbbbbb
        sel     WK\()\reg, WK\()\reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg, WK\()\reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

/* Head step for 0565 -> 8888: numbytes counts OUTPUT bytes, so only half as
 * many source bytes are loaded.
 *   16 output bytes: 8 source bytes via pixldst, naming WK<firstreg> and
 *                    WK<firstreg+2> (the tail macro expands each of those
 *                    into itself and the following register)
 *    8 output bytes: 4 source bytes into WK<firstreg>
 *    4 output bytes: 2 source bytes into WK<firstreg>
 * The loads are issued unconditionally (cond is not forwarded);
 * unaligned_mask and preload are not used. */
.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if \numbytes == 16
        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
 .elseif \numbytes == 8
        pixld   , 4, \firstreg, SRC, \unaligned_src
 .elseif \numbytes == 4
        pixld   , 2, \firstreg, SRC, \unaligned_src
 .endif
.endm

/* Tail step for 0565 -> 8888: expand the packed pixels loaded by the head
 * macro in place. numbytes counts output bytes:
 *   16 -> two pixel pairs (firstreg/firstreg+1 and firstreg+2/firstreg+3)
 *    8 -> one pixel pair  (firstreg/firstreg+1)
 *   anything else -> a single pixel in firstreg
 * cond is not used. */
.macro src_0565_8888_process_tail   cond, numbytes, firstreg
 .if \numbytes == 16
        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
 .else
  .if \numbytes == 8
        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
  .else
        src_0565_8888_1pixel \firstreg
  .endif
 .endif
.endm

/* Instantiate pixman_composite_src_0565_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/

/* Init for the x888 -> 0565 conversion: load the 5-bit-per-halfword mask
 * used to isolate red and blue, and pass STRIDE_S and ORIG_W to
 * line_saved_regs (the process macros reuse both as work registers). */
.macro src_x888_0565_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x001F001F
        line_saved_regs  STRIDE_S, ORIG_W
.endm

/* Convert one 32-bit pixel in WK<s> to a 16-bit 0565 pixel in the low half
 * of WK<d>. Expects MASK = 0x001F001F. Uses STRIDE_S as a temporary for the
 * green bits; WK<s> itself is only read.
 * The trailing bit diagrams show the destination register after each
 * instruction. */
.macro src_x888_0565_1pixel  s, d
        and     WK\()\d, MASK, WK\()\s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
        and     STRIDE_S, WK\()\s, #0xFC00               @ 0000000000000000gggggg0000000000
        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5        @ 00000000000-----rrrrr000000bbbbb
        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        /* Top 16 bits are discarded during the following STRH */
.endm

/* Convert two 32-bit pixels into one word holding two 16-bit 0565 pixels.
 *   slo  WK register holding the pixel destined for the low halfword
 *   shi  WK register holding the pixel destined for the high halfword
 *        (overwritten: it is reused to build the low-halfword result)
 *   d    WK register receiving the packed pair
 *   tmp  WK register used to build the high-halfword result (callers in
 *        this file often pass the same number as d)
 * Expects MASK = 0x001F001F. Corrupts SCRATCH.
 * The trailing bit diagrams show the destination register after each
 * instruction (upper-case = pixel from shi, lower-case = pixel from slo). */
.macro src_x888_0565_2pixels  slo, shi, d, tmp
        and     SCRATCH, WK\()\shi, #0xFC00                 @ 0000000000000000GGGGGG0000000000
        and     WK\()\tmp, MASK, WK\()\shi, lsr #3          @ 00000000000RRRRR00000000000BBBBB
        and     WK\()\shi, MASK, WK\()\slo, lsr #3          @ 00000000000rrrrr00000000000bbbbb
        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5       @ 00000000000-----RRRRRGGGGGGBBBBB
        and     SCRATCH, WK\()\slo, #0xFC00                 @ 0000000000000000gggggg0000000000
        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm

/* Head step for x888 -> 0565. numbytes counts OUTPUT bytes, so twice as many
 * source bytes are consumed.
 * Defines the WK4-WK7 aliases used by both this macro and
 * src_x888_0565_process_tail; they are only released at the end of the tail
 * macro, so the two must always be expanded as a pair. Note that WK6 is an
 * alias of WK3.
 * For 16 output bytes the 32 source bytes are loaded in stages, with the
 * first two pixel pairs already converted into WK0 and WK1 here; the
 * remaining two pairs are left in WK4-WK7 for the tail macro.
 * cond, firstreg, unaligned_src, unaligned_mask and preload are not used;
 * all loads are issued with the unaligned flag set to 0. */
.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        WK4     .req    STRIDE_S
        WK5     .req    STRIDE_M
        WK6     .req    WK3
        WK7     .req    ORIG_W
 .if \numbytes == 16
        pixld   , 16, 4, SRC, 0
        src_x888_0565_2pixels  4, 5, 0, 0
        pixld   , 8, 4, SRC, 0
        src_x888_0565_2pixels  6, 7, 1, 1
        pixld   , 8, 6, SRC, 0
 .else
        pixld   , \numbytes*2, 4, SRC, 0
 .endif
.endm

/* Tail step for x888 -> 0565: convert the source pixels still held in
 * WK4-WK7, store the packed result to DST, then release the WK4-WK7 aliases
 * that src_x888_0565_process_head defined.
 *   16 output bytes: finish WK2 and WK3 (WK0/WK1 were produced by the head
 *                    macro) and store from WK0. WK4 is passed as the
 *                    temporary for the last pair because WK6 aliases WK3.
 *    8 output bytes: two pairs into WK1 and WK2, stored from WK1
 *    4 output bytes: one pair into WK1, stored from WK1
 *    otherwise:      a single pixel into WK1, stored from WK1
 * cond and firstreg are not used. */
.macro src_x888_0565_process_tail   cond, numbytes, firstreg
 .if \numbytes == 16
        src_x888_0565_2pixels  4, 5, 2, 2
        src_x888_0565_2pixels  6, 7, 3, 4
 .elseif \numbytes == 8
        src_x888_0565_2pixels  4, 5, 1, 1
        src_x888_0565_2pixels  6, 7, 2, 2
 .elseif \numbytes == 4
        src_x888_0565_2pixels  4, 5, 1, 1
 .else
        src_x888_0565_1pixel  4, 1
 .endif
 .if \numbytes == 16
        pixst   , \numbytes, 0, DST
 .else
        pixst   , \numbytes, 1, DST
 .endif
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        .unreq  WK7
.endm

/* Instantiate pixman_composite_src_x888_0565_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail

/******************************************************************************/

/* Per-byte unsigned saturating add of eight source bytes into eight
 * destination bytes: WK<dst1> += MASK and WK<dst2> += STRIDE_M.
 * MASK and STRIDE_M are where add_8_8_process_head loads the source (it
 * aliases them as WK4/WK5). */
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK
        uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M
.endm

/* Per-byte unsigned saturating add of the source bytes held in MASK into
 * WK<dst>. */
.macro add_8_8_4pixels  cond, dst
        uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK
.endm

/* Head step for ADD 8 -> 8. Source bytes are loaded into WK4/WK5, which are
 * aliased to MASK/STRIDE_M for the duration of this macro; destination
 * bytes are loaded into the WK registers starting at WK<firstreg>.
 * For 16 bytes only two source registers are available, so the first 8
 * source bytes are loaded and added here, then the second 8 source bytes
 * are loaded over them for the tail macro to add.
 * unaligned_mask and preload are not used. */
.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if \numbytes == 16
        pixld   \cond, 8, 4, SRC, \unaligned_src
        pixld   \cond, 16, \firstreg, DST, 0
        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
        pixld   \cond, 8, 4, SRC, \unaligned_src
 .else
        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
        pixld   \cond, \numbytes, \firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

/* Tail step for ADD 8 -> 8: add the source bytes left in MASK/STRIDE_M by
 * the head macro into the destination registers.
 *   16 bytes: the second half, WK<firstreg+2> and WK<firstreg+3>
 *             (the first half was already added by the head macro)
 *    8 bytes: WK<firstreg> and WK<firstreg+1>
 *   otherwise: WK<firstreg> only */
.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
 .else
  .if \numbytes == 8
        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
  .else
        add_8_8_4pixels \cond, \firstreg
  .endif
 .endif
.endm

/* Instantiate pixman_composite_add_8_8_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

/* Init for OVER 8888 -> 8888: load the rounding constant into MASK, prime
 * the GE flags for SEL (SCRATCH only receives the throwaway sum), and pass
 * STRIDE_D, STRIDE_S and ORIG_W to line_saved_regs because the process
 * macros reuse them as WK4, WK5 and WK7. */
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

/* Head step for OVER 8888 -> 8888: load numbytes of source pixels into the
 * WK registers starting at WK<4+firstreg> and the matching destination
 * pixels starting at WK<firstreg>.
 * WK4-WK7 are aliased to STRIDE_D, STRIDE_S, STRIDE_M and ORIG_W only for
 * the duration of this macro. cond, unaligned_mask and preload are unused. */
.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Leave the Z flag set only if every inspected source word is zero.
 *   numbytes   4 -> test WK<reg0> only
 *              more than 4 -> also WK<reg1>
 *              more than 8 -> also WK<reg2> and WK<reg3>
 * Each further test is conditional on EQ, so the chain stops updating the
 * flags at the first non-zero word. */
.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK\()\reg0, #0
 .if \numbytes > 4
        teqeq   WK\()\reg1, #0
 .endif
 /* numbytes > 8 implies numbytes > 4, so this need not be nested */
 .if \numbytes > 8
        teqeq   WK\()\reg2, #0
        teqeq   WK\()\reg3, #0
 .endif
.endm

/* Reduce WK<next> to just its top byte (the source alpha), shifted down to
 * bits 0-7, ready for over_8888_8888_1pixel. */
.macro over_8888_8888_prepare  next
        mov     WK\()\next, WK\()\next, lsr #24
.endm

/* Composite one source pixel OVER one destination pixel.
 *   src     WK register that holds the source ALPHA in bits 0-7 on entry
 *           (see over_8888_8888_prepare); it is reloaded with the complete
 *           source pixel part-way through
 *   dst     WK register holding the destination pixel; receives the result
 *   offset  Offset from SRC at which this source pixel can be re-read
 *           (callers pass negative values, so SRC has presumably already
 *           been advanced past the loaded pixels)
 *   next    WK register holding the following source pixel; it is reduced
 *           to its alpha here unless offset is -4, i.e. this is the last
 *           pixel of the group
 * Expects MASK = 0x00800080 and GE[3:0] = 0101 (over_8888_8888_init).
 * Corrupts SCRATCH. The instruction order is hand-scheduled around result
 * latencies, so do not reorder it casually. */
.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK\()\src, WK\()\src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK\()\dst
        uxtb16  WK\()\dst, WK\()\dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK\()\src, MASK
        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK\()\src, [SRC, #\offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if \offset < -4
        mov     WK\()\next, WK\()\next, lsr #24
 .endif
        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK\()\dst, SCRATCH, WK\()\dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

/* Tail step for OVER 8888 -> 8888.
 * If every loaded source pixel is zero, the whole group is skipped,
 * including the store (branch to local label 10). Otherwise each of the
 * numbytes/4 pixels is composited in turn and the results are stored to DST.
 * WK4-WK7 are aliased to STRIDE_D, STRIDE_S, STRIDE_M and ORIG_W for the
 * duration of the macro. The assembler symbols PROCESS_REG and PROCESS_OFF
 * are reassigned as loop counters for the .rept block. cond is not used. */
.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
        beq     10f
        /* Reduce the first source pixel to its alpha; later pixels are
         * prepared inside over_8888_8888_1pixel */
        over_8888_8888_prepare  %(4+\firstreg)
 /* PROCESS_OFF starts at -numbytes and steps by 4 up to -4 */
 .set PROCESS_REG, \firstreg
 .set PROCESS_OFF, -\numbytes
 .rept \numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Instantiate pixman_composite_over_8888_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/

/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes; overwritten with the 4 products
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register (corrupted)
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 * Each product is rounded: 0x80 is added before the 257/256 correction that
 * approximates the division by 255.
*/
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  \tmp, \word
        uxtb16  \word, \word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     \tmp, \tmp, \byte, \half
        mla     \word, \word, \byte, \half /* 1 stall follows */
        uxtab16 \tmp, \tmp, \tmp, ror #8  /* 1 stall follows */
        uxtab16 \word, \word, \word, ror #8
        /* Recombine bytes */
        mov     \tmp, \tmp, ror #8
        sel     \word, \tmp, \word
.endm

/******************************************************************************/

/* Init for OVER 8888 x constant mask -> 8888: fetch the mask word from the
 * stack arguments and keep only its top (alpha) byte in MASK, load the
 * rounding constant into STRIDE_M, prime the GE flags for SEL, and pass
 * Y, STRIDE_D, STRIDE_S and ORIG_W to line_saved_regs because the process
 * macros reuse them as WK4-WK7. */
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

/* Head step for OVER 8888 x n -> 8888: load numbytes of source pixels
 * starting at WK<4+(firstreg%2)> and the matching destination pixels
 * starting at WK<firstreg>.
 * WK4-WK7 are aliased to Y, STRIDE_D, STRIDE_S and ORIG_W only for the
 * duration of this macro. cond, unaligned_mask and preload are unused. */
.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Composite one masked source pixel OVER one destination pixel.
 *   src  WK register holding the source pixel (overwritten with src * mask)
 *   dst  WK register holding the destination pixel; receives the result
 * Expects MASK = mask alpha, STRIDE_M = 0x00800080, GE[3:0] = 0101 and
 * WK6 = 255 (set by over_8888_n_8888_process_tail). Must be expanded while
 * the WK6/WK7 aliases are defined. Corrupts WK7 and SCRATCH. */
.macro over_8888_n_8888_1pixel src, dst
        /* src = src * mask alpha */
        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
        /* WK7 = 255 - alpha of the masked source */
        sub     WK7, WK6, WK\()\src, lsr #24
        /* dst = dst * (255 - alpha) */
        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
        /* dst += src, saturating per byte */
        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

/* Tail step for OVER 8888 x n -> 8888.
 * If every loaded source pixel is zero the whole group is skipped, including
 * the store (branch to local label 10). Otherwise each of the numbytes/4
 * pixels is composited and the results are stored to DST.
 * WK4-WK7 are aliased to Y, STRIDE_D, STRIDE_S and ORIG_W for the duration
 * of the macro. The assembler symbol PROCESS_REG is reassigned as the loop
 * counter for the .rept block. cond is not used. */
.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
        beq     10f
        /* WK6 holds the constant 255 from here on (this overwrites any
         * source pixel that was loaded into it) */
        mov     WK6, #255
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
  .if \numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

/* Instantiate pixman_composite_over_8888_n_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

/* Init for OVER constant source x 8-bit mask -> 8888: fetch the source word
 * from the stack arguments and pre-split it, leaving bytes 0 and 2 in
 * STRIDE_S and bytes 1 and 3 in SRC; prime the GE flags for SEL; and pass
 * Y, STRIDE_D, STRIDE_M and ORIG_W to line_saved_regs because the process
 * macros reuse them. */
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

/* Per-line hook: load the rounding constant 0x00800080 into STRIDE_D, which
 * the pixel macros use as the addend for their multiplies. (STRIDE_D is
 * listed in line_saved_regs by the init macro, so it presumably does not
 * keep this value from one line to the next -- confirm in the header.)
 * A literal pool is emitted right here with .ltorg, and the branch jumps
 * over it so the pool data is never executed. */
.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

/* Head step for OVER n x 8 -> 8888: load numbytes/4 mask bytes (one per
 * destination pixel) into WK4, which is aliased to STRIDE_M for the
 * duration of this macro, and numbytes of destination pixels starting at
 * WK<firstreg>. cond, unaligned_src and preload are not used. */
.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
.endm

/* Composite the constant source, scaled by one mask byte, OVER one
 * destination pixel.
 *   src  Index (0-3) of the mask byte within WK4 to use
 *   dst  WK register holding the destination pixel; receives the result
 * Expects STRIDE_S / SRC to hold the even / odd bytes of the source
 * (over_n_8_8888_init), STRIDE_D = 0x00800080 (over_n_8_8888_newline) and
 * GE[3:0] = 0101. Must be expanded while WK4 is aliased.
 * Corrupts Y, ORIG_W and SCRATCH. */
.macro over_n_8_8888_1pixel src, dst
        /* Y = the selected mask byte */
        uxtb    Y, WK4, ror #\src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        /* ORIG_W = 255 - alpha of the masked source (alpha is in the top
         * byte of the odd-byte product) */
        sub     ORIG_W, ORIG_W, Y, lsr #24
        /* Y = masked source pixel, even and odd bytes recombined */
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK\()\dst, WK\()\dst, Y
.endm

/* Tail step for OVER n x 8 -> 8888.
 * If the mask register loaded by the head macro is zero, the whole group is
 * skipped, including the store (branch to local label 10). Otherwise each
 * of the numbytes/4 pixels is composited using successive mask bytes, and
 * the results are stored to DST.
 * WK4 is aliased to STRIDE_M for the duration of the macro. The assembler
 * symbol PROCESS_REG is reassigned as the loop counter. cond is not used. */
.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
        /* First argument is the mask byte index: 0, 1, 2, ... */
        over_n_8_8888_1pixel  %(PROCESS_REG-\firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
.endm

/* Instantiate pixman_composite_over_n_8_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/

/* Init for OVER_REVERSE constant source -> 8888: fetch the source word from
 * the stack arguments (kept whole in SRC), pre-split it into STRIDE_S
 * (bytes 0 and 2) and STRIDE_M (bytes 1 and 3), load the rounding constant
 * into MASK, prime the GE flags for SEL, and pass STRIDE_D and ORIG_W to
 * line_saved_regs because the per-line and per-pixel macros reuse them. */
.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     MASK, =0x00800080
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs  STRIDE_D, ORIG_W
.endm

/* Per-line hook: load 0xFF into STRIDE_D, which
 * over_reverse_n_8888_1pixel uses to compute 255 - alpha. */
.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm

/* Head step for OVER_REVERSE n -> 8888: only the destination is read, so
 * load numbytes of destination pixels starting at WK<firstreg>.
 * cond, unaligned_src, unaligned_mask and preload are not used. */
.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

/* Composite one destination pixel OVER the constant source (OVER_REVERSE).
 *   d        WK register holding the destination pixel; receives the result
 *   is_only  1 if this is the only pixel in the group, in which case an
 *            unchanged pixel branches to label 49 of the enclosing macro
 *            (over_reverse_n_8888_tail) to skip the store; otherwise an
 *            unchanged pixel just falls out at local label 9
 * Three cases:
 *   - destination word is zero: result is the source pixel (label 8)
 *   - 255 - destination alpha is zero: destination is left unchanged
 *   - otherwise: result = d + source * (255 - alpha(d)), rounded, with a
 *     saturating per-byte add
 * Expects SRC/STRIDE_S/STRIDE_M/MASK as set by over_reverse_n_8888_init,
 * STRIDE_D = 0xFF and GE[3:0] = 0101. Corrupts ORIG_W, SCRATCH and the
 * flags. */
.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK\()\d, #0
        beq     8f       /* replace with source */
        /* ORIG_W = 0xFF & ~alpha(d) = 255 - alpha(d); Z set if that is 0 */
        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
 .if \is_only == 1
        beq     49f      /* skip store */
 .else
        beq     9f       /* write same value back */
 .endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        /* Multiply by 257/256 to approximate the division by 255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        /* Recombine the two halves into one scaled source pixel */
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W
        uqadd8  WK\()\d, WK\()\d, ORIG_W
        b       9f
8:      mov     WK\()\d, SRC
9:
.endm

/* Process and store a group of 1, 2 or 4 destination pixels.
 *   numbytes   4, 8 or 16
 *   reg1-reg4  WK register numbers holding the destination pixels
 *              (reg3/reg4 are only used when numbytes is 16)
 * For a single pixel the store is skipped when the pixel is unchanged.
 * For several pixels, the pixels are ANDed together and the alpha byte of
 * the result is tested: if it is 0xFF, every pixel is opaque and the whole
 * group, including the store, is skipped via local label 49.
 * Corrupts SCRATCH, ORIG_W and the flags. */
.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if \numbytes == 4
        over_reverse_n_8888_1pixel  \reg1, 1
 .else
        and     SCRATCH, WK\()\reg1, WK\()\reg2
  .if \numbytes == 16
        and     SCRATCH, SCRATCH, WK\()\reg3
        and     SCRATCH, SCRATCH, WK\()\reg4
  .endif
        /* ASR #24 sign-extends the combined alpha byte; its inverse is zero
         * only when that byte is 0xFF */
        mvns    SCRATCH, SCRATCH, asr #24
        beq     49f /* skip store if all opaque */
        over_reverse_n_8888_1pixel  \reg1, 0
        over_reverse_n_8888_1pixel  \reg2, 0
  .if \numbytes == 16
        over_reverse_n_8888_1pixel  \reg3, 0
        over_reverse_n_8888_1pixel  \reg4, 0
  .endif
 .endif
        pixst   , \numbytes, \reg1, DST
49:
.endm

/* Tail step for OVER_REVERSE n -> 8888: adapter that expands firstreg into
 * four consecutive WK register numbers and hands them to
 * over_reverse_n_8888_tail. cond is not used. */
.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

/* Instantiate pixman_composite_over_reverse_n_8888_asm_armv6 through
 * generate_composite_function (defined in the included header).
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel -- confirm against the header.
 * Every argument is separated by an explicit comma, matching the other
 * invocations in this file, rather than relying on whitespace separation.
 * The invocation is kept as one logical line: no blank lines may follow a
 * backslash continuation, or the argument list would end early. */
generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail

/******************************************************************************/

/* Init for the component-alpha OVER routine.
 * Defines the register aliases used by the over_white_8888_8888_ca_* macros
 * (HALF, TMP0-TMP3, and WK4 on SCRATCH); these stay defined until
 * over_white_8888_8888_ca_cleanup releases them. Also loads the rounding
 * addend 0x80 into HALF, primes the GE flags for SEL, and sets the
 * assembler symbol DST_PRELOAD_BIAS to 8 (consumed by the framework in the
 * included header -- its exact effect is not visible here). */
.macro over_white_8888_8888_ca_init
        HALF    .req    SRC
        TMP0    .req    STRIDE_D
        TMP1    .req    STRIDE_S
        TMP2    .req    STRIDE_M
        TMP3    .req    ORIG_W
        WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        .set DST_PRELOAD_BIAS, 8
.endm

/* Cleanup: undo the assembler state set by over_white_8888_8888_ca_init,
 * resetting DST_PRELOAD_BIAS to 0 and releasing the register aliases.
 * Emits no instructions. */
.macro over_white_8888_8888_ca_cleanup
        .set DST_PRELOAD_BIAS, 0
        .unreq  HALF
        .unreq  TMP0
        .unreq  TMP1
        .unreq  TMP2
        .unreq  TMP3
        .unreq  WK4
.endm

/* Combine one mask pixel with one destination pixel, per component:
 *   d = saturate(m + d * notmask / 255)   (rounded)
 *   m  Register holding the mask pixel
 *   d  Register holding the destination pixel; receives the result
 * On entry TMP0 must hold the bitwise inverse of the mask pixel (callers
 * compute it with MVN). Expects HALF = 0x80 and GE[3:0] = 0101.
 * Corrupts TMP0-TMP3. The instruction order is hand-scheduled around result
 * latencies (see the stall notes), so do not reorder it casually. */
.macro over_white_8888_8888_ca_combine  m, d
        uxtb16  TMP1, TMP0                /* rb_notmask */
        uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */
        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
        uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */
        smlatt  \d, TMP1, TMP0, HALF      /* alpha */
        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
        pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */
        /* Multiply by 257/256 to approximate the division by 255 */
        uxtab16 TMP0, TMP0, TMP0, ror #8
        uxtab16 TMP1, TMP1, TMP1, ror #8
        /* Recombine the red/blue and alpha/green halves */
        mov     TMP0, TMP0, ror #8
        sel     \d, TMP0, TMP1
        uqadd8  \d, \d, \m                 /* d is a late result */
.endm

/* Single-pixel head: load one mask pixel into WK1 and one destination pixel
 * into WK3. */
.macro over_white_8888_8888_ca_1pixel_head
        pixld   , 4, 1, MASK, 0
        pixld   , 4, 3, DST, 0
.endm

/* Tail for a single pixel, with the mask in WK1 and the destination in WK3.
 *
 * Two shortcuts avoid the multiply: a mask word of 0x00000000 skips the
 * store altogether, and a mask word of 0xFFFFFFFF stores the mask value
 * itself. Everything else goes through over_white_8888_8888_ca_combine.
 * Alters the condition flags and TMP0 (plus whatever combine clobbers).
 */
.macro over_white_8888_8888_ca_1pixel_tail
        mvn     TMP0, WK1                 /* inverted mask, input to combine */
        /* ASR #32 replicates bit 31, so Z is set only for a mask of all 0s
         * or all 1s, and C (= bit 31) tells the two apart */
        teq     WK1, WK1, asr #32
        bne     1f
        bcc     3f                        /* mask all 0s: leave without storing */
        mov     WK3, WK1                  /* mask all 1s: result is 0xFFFFFFFF */
        b       2f
1:      over_white_8888_8888_ca_combine WK1, WK3
2:      pixst   , 4, 3, DST
3:
.endm

/* Head for a pair of pixels: only the two mask pixels are loaded here
 * (presumably into WK1 and WK2 - pixld is defined elsewhere); the
 * destination pixels are loaded at the start of the matching tail macro. */
.macro over_white_8888_8888_ca_2pixels_head
        pixld   , 8, 1, MASK, 0
.endm

/* Tail for a pair of pixels, with the masks in WK1/WK2 and the destination
 * pixels in WK3/WK4 (WK4 is the alias for SCRATCH set up by the init macro).
 *
 * Each pixel gets the same treatment as in the 1-pixel tail: a mask of all
 * 1s replaces the destination with the mask value, a mask of all 0s leaves
 * the loaded destination value as it is, anything else is blended by
 * over_white_8888_8888_ca_combine. The store is skipped only when both
 * masks are zero. Alters the condition flags and TMP0.
 */
.macro over_white_8888_8888_ca_2pixels_tail
        /* NOTE(review): no explicit final (unaligned) argument here, unlike
         * the other pixld calls in this file - presumably the default is
         * equivalent to 0; confirm against the pixld definition */
        pixld   , 8, 3, DST
        mvn     TMP0, WK1                 /* inverted first mask for combine */
        teq     WK1, WK1, asr #32         /* Z if all 0s or all 1s; C = bit 31 */
        bne     1f
        movcs   WK3, WK1                  /* first mask all 1s: result is the mask */
        bcs     2f
        /* First mask is all 0s: if the second is zero too, nothing changes */
        teq     WK2, #0
        beq     5f
        b       2f
1:      over_white_8888_8888_ca_combine WK1, WK3
2:      mvn     TMP0, WK2                 /* inverted second mask for combine */
        teq     WK2, WK2, asr #32
        bne     3f
        /* All 1s: result is the mask. All 0s: WK4 keeps the loaded value. */
        movcs   WK4, WK2
        b       4f
3:      over_white_8888_8888_ca_combine WK2, WK4
4:      pixst   , 8, 3, DST
5:
.endm

/* process_head hook for over_white_8888_8888_ca.
 *
 *   numbytes   number of output bytes to generate this time
 *
 * The remaining parameters belong to the common hook signature and are not
 * used. A 16-byte block is handled as two pairs of pixels: the first pair
 * is processed completely here, and only the loads of the second pair are
 * left pending for process_tail. A 4-byte block uses the single-pixel head,
 * every other size the pair head.
 */
.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if \numbytes == 16
        /* First pair of the block: head and tail back to back */
        over_white_8888_8888_ca_2pixels_head
        over_white_8888_8888_ca_2pixels_tail
 .endif
 .if \numbytes == 4
        over_white_8888_8888_ca_1pixel_head
 .else
        over_white_8888_8888_ca_2pixels_head
 .endif
.endm

/* process_tail hook for over_white_8888_8888_ca: completes whatever the
 * matching process_head left pending - a pair of pixels for every block
 * size other than 4 bytes, a single pixel for a 4-byte block. Only
 * numbytes is used; cond and firstreg are part of the common signature. */
.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
 .if \numbytes != 4
        over_white_8888_8888_ca_2pixels_tail
 .else
        over_white_8888_8888_ca_1pixel_tail
 .endif
.endm

/* Instantiate pixman_composite_over_white_8888_8888_ca_asm_armv6 from the
 * over_white_8888_8888_ca_* macros. The three numbers after the name are
 * presumably the source, mask and destination bits per pixel (0 = not read
 * from memory) - confirm against generate_composite_function.
 *
 * Every argument is separated by a comma, as in the invocation that
 * precedes this one in the file, instead of relying on whitespace to split
 * the arguments after the bpp triple and the flag expression. */
generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail

/* Per-function setup for over_n_8888_8888_ca.
 *
 * Pushes four constants (16 bytes) for the pixel macro to reload with LDRD,
 * splits the solid source colour into its red/blue and alpha/green halves,
 * sets GE[3:0] for SEL, and then rebinds the WK register names so that
 * r8-r11 are available for the reloaded constants and extra temporaries.
 * over_n_8888_8888_ca_cleanup reverses all of this.
 */
.macro over_n_8888_8888_ca_init
        /* Set up constants. RB_SRC and AG_SRC are in registers;
         * RB_FLDS, A_SRC, and the two HALF values need to go on the
         * stack (and the full SRC value is already there) */
        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
        mov     WK0, #0x00FF0000
        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
        mov     WK1, #0x80             /* HALF default value */
        mov     WK2, SCRATCH, lsr #24  /* A_SRC */
        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
        /* Stored lowest register first: RB_FLDS, HALF default, A_SRC, HALF
         * alternate (assuming WK0-WK3 are still r8-r11 here, which is what
         * the cleanup macro rebinds them to) */
        push    {WK0-WK3}
        /* The stacked arguments are now 16 bytes further from sp */
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
        uxtb16  SRC, SCRATCH               /* red/blue of the source colour */
        uxtb16  STRIDE_S, SCRATCH, ror #8  /* alpha/green of the source colour */
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, WK3, WK3
        /* Rebind the working registers for the per-pixel code */
        .unreq  WK0
        .unreq  WK1
        .unreq  WK2
        .unreq  WK3
        WK0     .req    Y
        WK1     .req    STRIDE_D
        RB_SRC  .req    SRC
        AG_SRC  .req    STRIDE_S
        WK2     .req    STRIDE_M
        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
        A_SRC   .req    r8
        HALF    .req    r9
        WK3     .req    r10
        WK4     .req    r11
        WK5     .req    SCRATCH
        WK6     .req    ORIG_W
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

/* Undo over_n_8888_8888_ca_init: discard the 16 bytes of constants that
 * were pushed, restore ARGS_STACK_OFFSET, drop all aliases created by the
 * init macro and bind WK0-WK3 to r8-r11 again. */
.macro over_n_8888_8888_ca_cleanup
        add     sp, sp, #16
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
        .unreq  WK0
        .unreq  WK1
        .unreq  RB_SRC
        .unreq  AG_SRC
        .unreq  WK2
        .unreq  RB_FLDS
        .unreq  A_SRC
        .unreq  HALF
        .unreq  WK3
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        WK0     .req    r8
        WK1     .req    r9
        WK2     .req    r10
        WK3     .req    r11
.endm

/* Head for one pixel: issue the loads of one mask pixel and one destination
 * pixel. pixld is defined elsewhere; presumably its third argument is the
 * index of the first WK register, which would put the mask in WK6 and the
 * destination in WK0 - the registers that over_n_8888_8888_ca_1pixel_tail
 * reads. */
.macro over_n_8888_8888_ca_1pixel_head
        pixld   , 4, 6, MASK, 0
        pixld   , 4, 0, DST, 0
.endm

/* Tail for one pixel, with the mask in WK6 and the destination in WK0.
 *
 * Three cases, selected on the mask word:
 *   - 0x00000000: nothing is stored (branch to 40);
 *   - 0xFFFFFFFF: the mask drops out, leaving either a plain copy of the
 *     source (source alpha 0xFF) or dest * (255 - alpha) + source (10);
 *   - anything else: full component-alpha blend (20), computing
 *     source * mask + dest * (1 - source_alpha * mask) per channel.
 *
 * Constants are reloaded from the block pushed by over_n_8888_8888_ca_init;
 * LOCALS_STACK_OFFSET is defined elsewhere and presumably addresses that
 * block. Alters the condition flags, A_SRC/RB_FLDS, HALF and WK0-WK6;
 * relies on GE[3:0] = 0101 for the SEL instructions.
 */
.macro over_n_8888_8888_ca_1pixel_tail
        /* Third and fourth pushed words: A_SRC and the alternate HALF
         * (0x00800080), going by the order init pushed them in */
        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
        bne     20f
        bcc     40f
        /* Mask is fully opaque (all channels) */
        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
        /* A_SRC becomes 255 - alpha; Z is set when the source is opaque */
        eors    A_SRC, A_SRC, #0xFF
        bne     10f
        /* Source is also opaque - same as src_8888_8888 */
        mov     WK0, WK6
        b       30f
10:     /* Same as over_8888_8888 */
        /* mul_8888_8 is defined elsewhere; presumably it scales the pixel
         * in WK0 by the 8-bit value in A_SRC, with WK5 as a temporary */
        mul_8888_8 WK0, A_SRC, WK5, HALF
        uqadd8  WK0, WK0, WK6
        b       30f
20:     /* No simplifications possible - do it the hard way */
        uxtb16  WK2, WK6, ror #8         /* ag_mask */
        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
        /* First and second pushed words: RB_FLDS and the default HALF
         * (0x80); this overwrites A_SRC, which shares r8 with RB_FLDS */
        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
        uxtb16  WK5, WK0                 /* rb_dest */
        uxtab16 WK3, WK3, WK3, ror #8
        uxtb16  WK6, WK0, ror #8         /* ag_dest */
        uxtab16 WK4, WK4, WK4, ror #8
        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
        /* 0x00FF00FF AND NOT (product >> 8): per-channel 255 - alpha * mask */
        bic     WK3, RB_FLDS, WK3, lsr #8
        bic     WK4, RB_FLDS, WK4, lsr #8
        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
        smlatt  WK0, WK5, WK3, HALF      /* red2 */
        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
        uxtab16 WK1, WK1, WK1, ror #8
        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
        smlabb  WK4, WK6, WK4, HALF      /* green2 */
        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
        uxtab16 WK3, WK3, WK3, ror #8
        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
        uxtab16 WK0, WK0, WK0, ror #8
        uxtab16 WK4, WK4, WK4, ror #8
        /* Bring the red/blue results down into bytes 0 and 2 for SEL */
        mov     WK1, WK1, ror #8
        mov     WK3, WK3, ror #8
        sel     WK2, WK1, WK0            /* recombine source*mask */
        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
30:     /* The destination buffer is already in the L1 cache, so
         * there's little point in amalgamating writes */
        pixst   , 4, 0, DST
40:
.endm

/* process_head hook for over_n_8888_8888_ca.
 *
 *   numbytes   number of output bytes to generate this time
 *
 * The other parameters belong to the common hook signature and are unused.
 * Pixels are always handled one at a time: every pixel of the block except
 * the last is loaded and finished here, and the last one is loaded only,
 * so that process_tail can finish it.
 */
.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        over_n_8888_8888_ca_1pixel_head
 .rept (\numbytes / 4) - 1
        /* finish the pixel whose loads are pending, then start the next */
        over_n_8888_8888_ca_1pixel_tail
        over_n_8888_8888_ca_1pixel_head
 .endr
.endm

/* process_tail hook for over_n_8888_8888_ca: finishes the single pixel whose
 * loads over_n_8888_8888_ca_process_head left pending. None of the hook
 * parameters are used, because the block size does not matter here. */
.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
        over_n_8888_8888_ca_1pixel_tail
.endm

/* Public entry point for over_n_8888_8888_ca.
 *
 * Reads the word at the top of the stack on entry (presumably the solid
 * source colour - it is the value that over_n_8888_8888_ca_init later
 * reloads through ARGS_STACK_OFFSET). When it is 0xFFFFFFFF the call is
 * handed over to the specialised over_white implementation with a plain
 * branch, so that routine returns straight to our caller.
 *
 * There is no return instruction: any other colour falls through into the
 * code that follows this stub, so the general-case helper has to be
 * emitted immediately after it. Clobbers ip and the condition flags.
 */
pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
        ldr     ip, [sp]
        cmn     ip, #1          /* Z set when ip == 0xFFFFFFFF */
        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
        /* not white: continue into the helper below */
pixman_end_asm_function

/* General-case implementation of over_n_8888_8888_ca. It must stay directly
 * behind the pixman_composite_over_n_8888_8888_ca_asm_armv6 entry stub,
 * which falls through into it for any source colour other than 0xFFFFFFFF.
 * The three numbers after the name are presumably the source, mask and
 * destination bits per pixel - confirm against generate_composite_function.
 *
 * Every argument is separated by a comma, as in the other comma-separated
 * invocations in this file, instead of relying on whitespace to split the
 * arguments after the bpp triple and the flag expression. */
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail

/******************************************************************************/

/* Per-function setup for in_reverse_8888_8888, which scales every
 * destination pixel by the alpha of the corresponding source pixel.
 *
 * MASK is not needed as a pointer (the function is instantiated with a
 * mask depth of 0), so it holds the 0x00800080 constant that is added to
 * the packed products. SRC is moved on by 3 bytes so that the byte loads
 * in the head macro fetch the top byte of each source pixel.
 */
.macro in_reverse_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        /* Offset the source pointer: we only need the alpha bytes */
        add     SRC, SRC, #3
        line_saved_regs  ORIG_W
.endm

/* Load the source alpha bytes for a block of pixels and step DST past it.
 *
 *   numbytes    size of the block in bytes
 *   reg1-reg3   indices of the WK registers that receive the alpha of the
 *               second, third and fourth pixel
 *
 * The alpha of the first pixel always goes to ORIG_W. One more byte is
 * loaded for blocks of 8 bytes or more, and a further two for a 16-byte
 * block. Each load moves SRC on by one whole pixel. Because DST is advanced
 * here, the tail macro addresses the destination with negative offsets.
 */
.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
        ldrb    ORIG_W, [SRC], #4
 .if \numbytes >= 8
        ldrb    WK\()\reg1, [SRC], #4
 .endif
 .if \numbytes == 16
        ldrb    WK\()\reg2, [SRC], #4
        ldrb    WK\()\reg3, [SRC], #4
 .endif
        add     DST, DST, #\numbytes
.endm

/* process_head hook for in_reverse_8888_8888: forwards to
 * in_reverse_8888_8888_head, turning firstreg into the three consecutive
 * register indices firstreg, firstreg+1 and firstreg+2 (the %() operator
 * evaluates the expression at assembly time). cond and the unaligned and
 * preload parameters are not used. */
.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        in_reverse_8888_8888_head  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm

/* Scale one destination pixel by one source alpha value.
 *
 *   s        register that already holds the alpha (is_only == 1) or that
 *            receives it from ORIG_W (any other value of is_only)
 *   d        register holding the destination pixel; replaced by the result
 *   offset   offset from SRC of the alpha byte to load into ORIG_W for the
 *            next pixel; 0 marks the last pixel of the set, for which no
 *            load is made and the shared label 48 is defined
 *   is_only  1 when the set consists of this pixel alone, which leaves out
 *            the per-pixel shortcuts for alpha == 0 and alpha == 0xFF
 *
 * In the general case each channel becomes channel * alpha + 0x80, reduced
 * to 8 bits by taking bits 15:8 of x + (x >> 8). Label 48 is the target of
 * the "all alphas are zero" branch in in_reverse_8888_8888_tail.
 *
 * The opaque-alpha test is made on \s itself rather than on a fixed
 * register, so the macro is correct for whichever register is passed as s.
 * Clobbers SCRATCH and, unless is_only is 1, the condition flags.
 */
.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
 .if \is_only != 1
        movs    \s, ORIG_W                  /* take this pixel's alpha; Z if it is 0 */
  .if \offset != 0
        /* Fetch the next pixel's alpha early; LDRB leaves the flags alone */
        ldrb    ORIG_W, [SRC, #\offset]
  .endif
        beq     1f                          /* alpha 0: result is 0 */
        teq     \s, #0xFF
        beq     2f                          /* alpha 0xFF: pixel is unchanged */
 .endif
        uxtb16  SCRATCH, \d                 /* rb_dest */
        uxtb16  \d, \d, ror #8               /* ag_dest */
        mla     SCRATCH, SCRATCH, \s, MASK
        mla     \d, \d, \s, MASK
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 \d, \d, \d, ror #8
        /* Red/blue results go to bytes 0 and 2; SEL (GE = 0101) merges them
         * with the alpha/green results in bytes 1 and 3 of d */
        mov     SCRATCH, SCRATCH, ror #8
        sel     \d, SCRATCH, \d
        b       2f
 .if \offset == 0
48:     /* Last mov d,#0 of the set - used as part of shortcut for
         * source values all 0 */
 .endif
1:      mov     \d, #0
2:
.endm

/* Finish a block of one, two or four pixels for in_reverse_8888_8888.
 *
 *   numbytes    size of the block in bytes: 4, 8, or anything else for 16
 *   reg1-reg4   indices of the WK registers used for the destination pixels
 *
 * On entry ORIG_W holds the alpha of the first pixel and WK<reg1> onwards
 * the alphas of the others, as loaded by in_reverse_8888_8888_head, and
 * DST already points past the block. The TEQ chain leaves Z set only when
 * all alphas of the block are equal and ORIG_W equals its own sign fill.
 * The destination pixels are loaded only when that is not the case (NE),
 * which overwrites the alphas held in the WK registers; the per-pixel macro
 * fetches them again from memory.
 *
 * When every alpha is zero the code branches to label 48 inside the last
 * in_reverse_8888_8888_1pixel expansion. That zeroes only the last
 * register, which is enough: the load was skipped, so the other registers
 * still hold their (zero) alpha values, and the store writes all zeros.
 *
 * NOTE(review): the alphas come from LDRB and are therefore zero-extended,
 * so "ORIG_W, asr #32" is always 0 with the carry clear. As written, the
 * carry-set exit to 49 ("source all -1") looks unreachable, and fully
 * opaque blocks are instead caught pixel by pixel by the 0xFF test in
 * in_reverse_8888_8888_1pixel - confirm.
 */
.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if \numbytes == 4
        teq     ORIG_W, ORIG_W, asr #32
        ldrne   WK\()\reg1, [DST, #-4]
 .elseif \numbytes == 8
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmdbne DST, {WK\()\reg1-WK\()\reg2}
 .else
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, WK\()\reg2
        teqeq   ORIG_W, WK\()\reg3
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmdbne DST, {WK\()\reg1-WK\()\reg4}
 .endif
        /* A TEQ with a plain register operand leaves C untouched, so on the
         * NE path C has to be forced to a known value */
        cmnne   DST, #0   /* clear C if NE */
        bcs     49f       /* no writes to dest if source all -1 */
        beq     48f       /* set dest to all 0 if source all 0 */
 .if \numbytes == 4
        in_reverse_8888_8888_1pixel  ORIG_W, WK\()\reg1, 0, 1
        str     WK\()\reg1, [DST, #-4]
 .elseif \numbytes == 8
        /* The offsets address the alpha of the following pixel relative to
         * SRC, which the head macro has already moved past the block */
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg2}
 .else
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -12, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, -8, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg3, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg4, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg4}
 .endif
49:
.endm

/* process_tail hook for in_reverse_8888_8888: forwards to
 * in_reverse_8888_8888_tail, turning firstreg into the four consecutive
 * register indices firstreg .. firstreg+3 (evaluated at assembly time by
 * the %() operator). cond is not used. */
.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
        in_reverse_8888_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

/* Instantiate pixman_composite_in_reverse_8888_8888_asm_armv6 from the
 * in_reverse_8888_8888_* macros. The three numbers after the name are
 * presumably the source, mask and destination bits per pixel (0 = not read
 * from memory) - confirm against generate_composite_function.
 *
 * Every argument is separated by a comma, as in the other comma-separated
 * invocations in this file, instead of relying on whitespace to split the
 * arguments after the bpp triple and the flag expression. */
generate_composite_function \
    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
    2, /* prefetch distance */ \
    in_reverse_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    in_reverse_8888_8888_process_head, \
    in_reverse_8888_8888_process_tail

/******************************************************************************/

/* Per-function setup for over_n_8888 (solid colour over a 32-bit
 * destination). Neither a source nor a mask image is read, so the
 * registers named after them hold loop invariants instead:
 *
 *   MASK      0x00800080, the constant handed to mul_8888_8
 *   SRC       the solid colour, fetched from the stacked arguments
 *   STRIDE_M  255 minus the alpha (top byte) of the colour, i.e. the
 *             factor applied to every destination pixel
 *
 * GE[3:0] is left at 0101; SCRATCH only serves as a throwaway destination
 * for the UADD8 that sets it.
 */
.macro over_n_8888_init
        ldr     MASK, =0x00800080
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* 0x80 + 0x80 carries out of bytes 0 and 2 only: GE[3:0] = 0101,
         * which is what the SEL instructions need */
        uadd8   SCRATCH, MASK, MASK
        /* Destination multiplier: alpha first, then 255 - alpha */
        mov     STRIDE_M, SRC, lsr #24
        rsb     STRIDE_M, STRIDE_M, #255
.endm

/* process_head hook for over_n_8888: the only work that can be started
 * early is the load of the destination pixels.
 *
 *   numbytes   number of output bytes to generate this time
 *   firstreg   passed on to pixld (presumably the index of the first WK
 *              register to load - pixld is defined elsewhere)
 *
 * cond and the unaligned and preload parameters are not used.
 */
.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

/* Composite the solid colour over the destination pixel in WK<dst>.
 *
 *   dst   index of the WK register holding the pixel; updated in place
 *
 * mul_8888_8 is defined elsewhere; presumably it scales all four channels
 * of the pixel by the 8-bit factor in STRIDE_M (255 - source alpha), using
 * SCRATCH as a temporary and MASK as its 0x00800080 constant. The colour in
 * SRC is then added with unsigned saturation.
 */
.macro over_n_8888_1pixel dst
        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
        uqadd8  WK\()\dst, WK\()\dst, SRC
.endm

/* process_tail hook for over_n_8888: blend every pixel loaded by
 * over_n_8888_process_head and store the block.
 *
 *   numbytes   size of the block in bytes; numbytes / 4 pixels are handled
 *   firstreg   index of the WK register holding the first pixel
 *
 * cond is not used. The assembly-time symbol PROCESS_REG counts through
 * the register indices and is left at firstreg + numbytes / 4 afterwards.
 */
.macro over_n_8888_process_tail  cond, numbytes, firstreg
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
        /* %() passes the current value of the symbol, not its name */
        over_n_8888_1pixel %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , \numbytes, \firstreg, DST
.endm

/* Instantiate pixman_composite_over_n_8888_asm_armv6 from the over_n_8888_*
 * macros. The three numbers after the name are presumably the source, mask
 * and destination bits per pixel (0 = not read from memory) - confirm
 * against generate_composite_function.
 *
 * Every argument is separated by a comma, as in the other comma-separated
 * invocations in this file, instead of relying on whitespace to split the
 * arguments after the bpp triple and the flag expression. */
generate_composite_function \
    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
    2, /* prefetch distance */ \
    over_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_n_8888_process_head, \
    over_n_8888_process_tail

/******************************************************************************/

Source code

Revision control

Copy as Markdown

Other Tools