Revision control
Copy as Markdown
Other Tools
/* asm-common-aarch64.h - Poly1305 macros for ARMv8/AArch64 assembly
*
* Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
*/
#ifndef GCRY_ASM_POLY1305_AARCH64_H
#define GCRY_ASM_POLY1305_AARCH64_H
#include "asm-common-aarch64.h"
#ifdef __AARCH64EL__
#define le_to_host(reg) /*_*/
#else
#define le_to_host(reg) rev reg, reg;
#endif
/**********************************************************************
poly1305 for stitched chacha20-poly1305 Aarch64 implementations
**********************************************************************/
#define POLY_RSTATE x8
#define POLY_RSRC x9
#define POLY_R_H0 x10
#define POLY_R_H1 x11
#define POLY_R_H2 x12
#define POLY_R_H2d w12
#define POLY_R_R0 x13
#define POLY_R_R1 x14
#define POLY_R_R1_MUL5 x15
#define POLY_R_X0_HI x16
#define POLY_R_X0_LO x17
#define POLY_R_X1_HI x19
#define POLY_R_X1_LO x20
#define POLY_R_ONE x21
#define POLY_R_ONEd w21
#define POLY_TMP0 x22
#define POLY_TMP1 x23
#define POLY_TMP2 x24
#define POLY_TMP3 x25
#define POLY_CHACHA_ROUND x26
#define POLY_S_R0 (4 * 4 + 0 * 8)
#define POLY_S_R1 (4 * 4 + 1 * 8)
#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)
#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)
#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)
#define POLY1305_PUSH_REGS() \
stp x19, x20, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(19, 0); \
CFI_REG_ON_STACK(20, 8); \
stp x21, x22, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(21, 0); \
CFI_REG_ON_STACK(22, 8); \
stp x23, x24, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(23, 0); \
CFI_REG_ON_STACK(24, 8); \
stp x25, x26, [sp, #-16]!; \
CFI_ADJUST_CFA_OFFSET(16); \
CFI_REG_ON_STACK(25, 0); \
CFI_REG_ON_STACK(26, 8);
#define POLY1305_POP_REGS() \
ldp x25, x26, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x25); \
CFI_RESTORE(x26); \
ldp x23, x24, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x23); \
CFI_RESTORE(x24); \
ldp x21, x22, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x21); \
CFI_RESTORE(x22); \
ldp x19, x20, [sp], #16; \
CFI_ADJUST_CFA_OFFSET(-16); \
CFI_RESTORE(x19); \
CFI_RESTORE(x20);
#define POLY1305_LOAD_STATE() \
ldr POLY_R_R1, [POLY_RSTATE, #(POLY_S_R1)]; \
ldr POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
ldr POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
ldr POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \
ldr POLY_R_R0, [POLY_RSTATE, #(POLY_S_R0)]; \
add POLY_R_R1_MUL5, POLY_R_R1, POLY_R_R1, lsr #2; \
mov POLY_R_ONE, #1;
#define POLY1305_STORE_STATE() \
str POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
str POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
str POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)];
#define POLY1305_BLOCK_PART1(src_offset) \
/* a = h + m */ \
ldr POLY_TMP0, [POLY_RSRC, #((src_offset) + 0 * 8)];
#define POLY1305_BLOCK_PART2(src_offset) \
ldr POLY_TMP1, [POLY_RSRC, #((src_offset) + 1 * 8)];
#define POLY1305_BLOCK_PART3() \
le_to_host(POLY_TMP0);
#define POLY1305_BLOCK_PART4() \
le_to_host(POLY_TMP1);
#define POLY1305_BLOCK_PART5() \
adds POLY_R_H0, POLY_R_H0, POLY_TMP0;
#define POLY1305_BLOCK_PART6() \
adcs POLY_R_H1, POLY_R_H1, POLY_TMP1;
#define POLY1305_BLOCK_PART7() \
adc POLY_R_H2d, POLY_R_H2d, POLY_R_ONEd;
#define POLY1305_BLOCK_PART8() \
/* h = a * r (partial mod 2^130-5): */ \
mul POLY_R_X1_LO, POLY_R_H0, POLY_R_R1; /* lo: h0 * r1 */
#define POLY1305_BLOCK_PART9() \
mul POLY_TMP0, POLY_R_H1, POLY_R_R0; /* lo: h1 * r0 */
#define POLY1305_BLOCK_PART10() \
mul POLY_R_X0_LO, POLY_R_H0, POLY_R_R0; /* lo: h0 * r0 */
#define POLY1305_BLOCK_PART11() \
umulh POLY_R_X1_HI, POLY_R_H0, POLY_R_R1; /* hi: h0 * r1 */
#define POLY1305_BLOCK_PART12() \
adds POLY_R_X1_LO, POLY_R_X1_LO, POLY_TMP0;
#define POLY1305_BLOCK_PART13() \
umulh POLY_TMP1, POLY_R_H1, POLY_R_R0; /* hi: h1 * r0 */
#define POLY1305_BLOCK_PART14() \
mul POLY_TMP2, POLY_R_H1, POLY_R_R1_MUL5; /* lo: h1 * r1 mod 2^130-5 */
#define POLY1305_BLOCK_PART15() \
umulh POLY_R_X0_HI, POLY_R_H0, POLY_R_R0; /* hi: h0 * r0 */
#define POLY1305_BLOCK_PART16() \
adc POLY_R_X1_HI, POLY_R_X1_HI, POLY_TMP1;
#define POLY1305_BLOCK_PART17() \
umulh POLY_TMP3, POLY_R_H1, POLY_R_R1_MUL5; /* hi: h1 * r1 mod 2^130-5 */
#define POLY1305_BLOCK_PART18() \
adds POLY_R_X0_LO, POLY_R_X0_LO, POLY_TMP2;
#define POLY1305_BLOCK_PART19() \
mul POLY_R_H1, POLY_R_H2, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */
#define POLY1305_BLOCK_PART20() \
adc POLY_R_X0_HI, POLY_R_X0_HI, POLY_TMP3;
#define POLY1305_BLOCK_PART21() \
mul POLY_R_H2, POLY_R_H2, POLY_R_R0; /* h2 * r0 */
#define POLY1305_BLOCK_PART22() \
adds POLY_R_H1, POLY_R_H1, POLY_R_X1_LO;
#define POLY1305_BLOCK_PART23() \
adc POLY_R_H0, POLY_R_H2, POLY_R_X1_HI;
#define POLY1305_BLOCK_PART24() \
/* carry propagation */ \
and POLY_R_H2, POLY_R_H0, #3;
#define POLY1305_BLOCK_PART25() \
lsr POLY_R_H0, POLY_R_H0, #2;
#define POLY1305_BLOCK_PART26() \
add POLY_R_H0, POLY_R_H0, POLY_R_H0, lsl #2;
#define POLY1305_BLOCK_PART27() \
adds POLY_R_H0, POLY_R_H0, POLY_R_X0_LO;
#define POLY1305_BLOCK_PART28() \
adcs POLY_R_H1, POLY_R_H1, POLY_R_X0_HI;
#define POLY1305_BLOCK_PART29() \
adc POLY_R_H2d, POLY_R_H2d, wzr;
//#define TESTING_POLY1305_ASM
#ifdef TESTING_POLY1305_ASM
/* for testing only. */
.align 3
.globl _gcry_poly1305_aarch64_blocks1
ELF(.type _gcry_poly1305_aarch64_blocks1,%function;)
_gcry_poly1305_aarch64_blocks1:
/* input:
* x0: poly1305-state
* x1: src
* x2: nblks
*/
CFI_STARTPROC()
POLY1305_PUSH_REGS();
mov POLY_RSTATE, x0;
mov POLY_RSRC, x1;
POLY1305_LOAD_STATE();
.L_gcry_poly1305_aarch64_loop1:
POLY1305_BLOCK_PART1(0 * 16);
POLY1305_BLOCK_PART2(0 * 16);
add POLY_RSRC, POLY_RSRC, #16;
POLY1305_BLOCK_PART3();
POLY1305_BLOCK_PART4();
POLY1305_BLOCK_PART5();
POLY1305_BLOCK_PART6();
POLY1305_BLOCK_PART7();
POLY1305_BLOCK_PART8();
POLY1305_BLOCK_PART9();
POLY1305_BLOCK_PART10();
POLY1305_BLOCK_PART11();
POLY1305_BLOCK_PART12();
POLY1305_BLOCK_PART13();
POLY1305_BLOCK_PART14();
POLY1305_BLOCK_PART15();
POLY1305_BLOCK_PART16();
POLY1305_BLOCK_PART17();
POLY1305_BLOCK_PART18();
POLY1305_BLOCK_PART19();
POLY1305_BLOCK_PART20();
POLY1305_BLOCK_PART21();
POLY1305_BLOCK_PART22();
POLY1305_BLOCK_PART23();
POLY1305_BLOCK_PART24();
POLY1305_BLOCK_PART25();
POLY1305_BLOCK_PART26();
POLY1305_BLOCK_PART27();
POLY1305_BLOCK_PART28();
POLY1305_BLOCK_PART29();
subs x2, x2, #1;
b.ne .L_gcry_poly1305_aarch64_loop1;
POLY1305_STORE_STATE();
mov x0, #0;
POLY1305_POP_REGS();
ret;
CFI_ENDPROC()
ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;)
#endif
#endif /* GCRY_ASM_POLY1305_AARCH64_H */