Revision control
Copy as Markdown
Other Tools
/*
* (C) 2026 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
#include <botan/internal/poly1305.h>
#include <botan/internal/isa_extn.h>
#include <immintrin.h>
namespace Botan {
namespace {
// NOLINTBEGIN(portability-simd-intrinsics)
class SIMD_8x44 final {
public:
BOTAN_FN_ISA_AVX512 SIMD_8x44() : m_v(_mm512_setzero_si512()) {}
static BOTAN_FN_ISA_AVX512 SIMD_8x44 splat(uint64_t x) { return SIMD_8x44(_mm512_set1_epi64(x)); }
static BOTAN_FN_ISA_AVX512 SIMD_8x44 load(const void* p) {
return SIMD_8x44(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(p)));
}
BOTAN_FN_ISA_AVX512 SIMD_8x44(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) :
SIMD_8x44(_mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0)) {}
// Permute across two vectors using index vector
static BOTAN_FN_ISA_AVX512 SIMD_8x44 permute2(const SIMD_8x44& idx, const SIMD_8x44& a, const SIMD_8x44& b) {
return SIMD_8x44(_mm512_permutex2var_epi64(a.m_v, idx.m_v, b.m_v));
}
static BOTAN_FN_ISA_AVX512 SIMD_8x44 permute3(
const SIMD_8x44& idx0, const SIMD_8x44& idx1, const SIMD_8x44& a, const SIMD_8x44& b, const SIMD_8x44& c) {
return SIMD_8x44::permute2(idx1, SIMD_8x44::permute2(idx0, a, b), c);
}
// VBMI2 double shift right: concatenate (b:a) and shift right by count
template <int COUNT>
static BOTAN_FN_ISA_AVX512 SIMD_8x44 shrdi(const SIMD_8x44& a, const SIMD_8x44& b) {
return SIMD_8x44(_mm512_shrdi_epi64(a.m_v, b.m_v, COUNT));
}
BOTAN_FN_ISA_AVX512 SIMD_8x44 add_lane_zero(uint64_t b) {
return SIMD_8x44(_mm512_mask_add_epi64(m_v, 0x01, m_v, _mm512_set1_epi64(b)));
}
// IFMA: accumulator += (a * b) low 52 bits
BOTAN_FN_ISA_AVX512 SIMD_8x44& ifma_lo(const SIMD_8x44& a, const SIMD_8x44& b) {
m_v = _mm512_madd52lo_epu64(m_v, a.m_v, b.m_v);
return *this;
}
// IFMA: accumulator += (a * b) high 52 bits
BOTAN_FN_ISA_AVX512 SIMD_8x44& ifma_hi(const SIMD_8x44& a, const SIMD_8x44& b) {
m_v = _mm512_madd52hi_epu64(m_v, a.m_v, b.m_v);
return *this;
}
// Multiply by 20: 20*x = (x << 4) + (x << 2)
BOTAN_FN_ISA_AVX512 SIMD_8x44 mul_20() const {
return SIMD_8x44(_mm512_add_epi64(_mm512_slli_epi64(m_v, 4), _mm512_slli_epi64(m_v, 2)));
}
template <size_t S>
BOTAN_FN_ISA_AVX512 SIMD_8x44 shr() const {
return SIMD_8x44(_mm512_srli_epi64(m_v, S));
}
BOTAN_FN_ISA_AVX512 uint64_t horizontal_add() const { return _mm512_reduce_add_epi64(m_v); }
BOTAN_FN_ISA_AVX512 SIMD_8x44 operator&(const SIMD_8x44& other) const {
return SIMD_8x44(_mm512_and_si512(m_v, other.m_v));
}
BOTAN_FN_ISA_AVX512 SIMD_8x44 operator|(const SIMD_8x44& other) const {
return SIMD_8x44(_mm512_or_si512(m_v, other.m_v));
}
static BOTAN_FN_ISA_AVX512 void interleave_3x8(SIMD_8x44& r0, SIMD_8x44& r1, SIMD_8x44& r2) {
const auto idx1_z0 = SIMD_8x44(0, 3, 6, 9, 12, 15, -1, -1);
const auto idx2_z0 = SIMD_8x44(7, 6, 5, 4, 3, 2, 10, 13);
const auto idx1_z1 = SIMD_8x44(1, 4, 7, 10, 13, -1, -1, -1);
const auto idx2_z1 = SIMD_8x44(7, 6, 5, 4, 3, 8, 11, 14);
const auto idx1_z2 = SIMD_8x44(2, 5, 8, 11, 14, -1, -1, -1);
const auto idx2_z2 = SIMD_8x44(7, 6, 5, 4, 3, 9, 12, 15);
// NOLINTBEGIN(*-suspicious-call-argument)
auto z0 = SIMD_8x44::permute3(idx1_z0, idx2_z0, r0, r1, r2);
auto z1 = SIMD_8x44::permute3(idx1_z1, idx2_z1, r0, r1, r2);
auto z2 = SIMD_8x44::permute3(idx1_z2, idx2_z2, r0, r1, r2);
// NOLINTEND(*-suspicious-call-argument)
r0 = z0;
r1 = z1;
r2 = z2;
}
private:
__m512i BOTAN_FN_ISA_AVX512 raw() const { return m_v; }
explicit BOTAN_FN_ISA_AVX512 SIMD_8x44(__m512i v) : m_v(v) {}
__m512i m_v;
};
// NOLINTEND(portability-simd-intrinsics)
} // namespace
/*
* Process 8 blocks at a time using AVX-512 IFMA
* h = (h + m[0]) * r^8 + m[1] * r^7 + ... + m[7] * r
*/
size_t BOTAN_FN_ISA_AVX512 Poly1305::poly1305_avx512_blocks(secure_vector<uint64_t>& X,
const uint8_t* m,
size_t blocks) {
constexpr uint64_t M44 = 0xFFFFFFFFFFF;
constexpr uint64_t M42 = 0x3FFFFFFFFFF;
constexpr uint64_t hibit64 = static_cast<uint64_t>(1) << 40;
if(blocks < 8) {
return 0;
}
const size_t original_blocks = blocks;
// Load h from state
uint64_t h0 = X[2];
uint64_t h1 = X[3];
uint64_t h2 = X[4];
SIMD_8x44 r0 = SIMD_8x44::load(&X[5]);
SIMD_8x44 r1 = SIMD_8x44::load(&X[5 + 8]);
SIMD_8x44 r2 = SIMD_8x44::load(&X[5 + 2 * 8]);
SIMD_8x44::interleave_3x8(r0, r1, r2);
const auto s1 = r1.mul_20();
const auto s2 = r2.mul_20();
// Constants for vectorized message loading
// Deinterleave indices: separate low (t0) and high (t1) 64-bit halves of each 128-bit block
// Memory layout: [t0_0, t1_0, t0_1, t1_1, ...] -> want [t0_0..t0_7] and [t1_0..t1_7]
const auto idx_lo = SIMD_8x44(14, 12, 10, 8, 6, 4, 2, 0);
const auto idx_hi = SIMD_8x44(15, 13, 11, 9, 7, 5, 3, 1);
const auto mask44 = SIMD_8x44::splat(M44);
const auto mask42 = SIMD_8x44::splat(M42);
const auto hibit = SIMD_8x44::splat(hibit64);
while(blocks >= 8) {
// Load 8 message blocks (128 bytes) with two 512-bit loads
const auto data0 = SIMD_8x44::load(m);
const auto data1 = SIMD_8x44::load(m + 64);
// Deinterleave: separate low and high 64-bit halves of each 128-bit block
const auto t0 = SIMD_8x44::permute2(idx_lo, data0, data1);
const auto t1 = SIMD_8x44::permute2(idx_hi, data0, data1);
// Convert to radix 2^44 representation using VBMI2
// limb0 = t0[43:0]
// limb1 = t1[23:0]:t0[63:44] (bits 44-87 of block)
// limb2 = t1[63:24] | hibit (bits 88-129 of block + high bit)
auto m0 = t0 & mask44;
auto m1 = SIMD_8x44::shrdi<44>(t0, t1) & mask44;
auto m2 = (t1.shr<24>() & mask42) | hibit;
// Add h to first block
m0 = m0.add_lane_zero(h0);
m1 = m1.add_lane_zero(h1);
m2 = m2.add_lane_zero(h2);
// d0 = m0*r0 + m1*s2 + m2*s1
const SIMD_8x44 d0_lo = SIMD_8x44().ifma_lo(m0, r0).ifma_lo(m1, s2).ifma_lo(m2, s1);
const SIMD_8x44 d0_hi = SIMD_8x44().ifma_hi(m0, r0).ifma_hi(m1, s2).ifma_hi(m2, s1);
// d1 = m0*r1 + m1*r0 + m2*s2
const SIMD_8x44 d1_lo = SIMD_8x44().ifma_lo(m0, r1).ifma_lo(m1, r0).ifma_lo(m2, s2);
const SIMD_8x44 d1_hi = SIMD_8x44().ifma_hi(m0, r1).ifma_hi(m1, r0).ifma_hi(m2, s2);
// d2 = m0*r2 + m1*r1 + m2*r0
const SIMD_8x44 d2_lo = SIMD_8x44().ifma_lo(m0, r2).ifma_lo(m1, r1).ifma_lo(m2, r0);
const SIMD_8x44 d2_hi = SIMD_8x44().ifma_hi(m0, r2).ifma_hi(m1, r1).ifma_hi(m2, r0);
// Horizontal adds can't overflow - at most 8*3*(2**52-1) ~= 2**57
const uint64_t sum0_lo = d0_lo.horizontal_add();
const uint64_t sum0_hi = d0_hi.horizontal_add();
uint64_t sum1_lo = d1_lo.horizontal_add();
const uint64_t sum1_hi = d1_hi.horizontal_add();
uint64_t sum2_lo = d2_lo.horizontal_add();
const uint64_t sum2_hi = d2_hi.horizontal_add();
h0 = sum0_lo & M44;
sum1_lo += (sum0_lo >> 44) + (sum0_hi << 8);
h1 = sum1_lo & M44;
sum2_lo += (sum1_lo >> 44) + (sum1_hi << 8);
h2 = sum2_lo & M42;
// Wrap-around reduction: carry * 5 goes back to h0
uint64_t carry = ((sum2_lo >> 42) + (sum2_hi << 10)) * 5;
carry += h0;
h0 = carry & M44;
carry >>= 44;
carry += h1;
h1 = carry & M44;
carry >>= 44;
h2 += carry;
m += 8 * 16;
blocks -= 8;
}
X[2] = h0;
X[3] = h1;
X[4] = h2;
return (original_blocks - blocks);
}
} // namespace Botan