simd_avx512.h - mozsearch

/*

* (C) 2023 Jack Lloyd

* Botan is released under the Simplified BSD License (see license.txt)

*/

#ifndef BOTAN_SIMD_AVX512_H_

#define BOTAN_SIMD_AVX512_H_

#include <botan/compiler.h>

#include <botan/types.h>

#include <botan/internal/isa_extn.h>

#include <immintrin.h>

namespace Botan {

class SIMD_16x32 final {

   public:

      SIMD_16x32& operator=(const SIMD_16x32& other) = default;

      SIMD_16x32(const SIMD_16x32& other) = default;

      SIMD_16x32& operator=(SIMD_16x32&& other) = default;

      SIMD_16x32(SIMD_16x32&& other) = default;

      BOTAN_FN_ISA_AVX512

      BOTAN_FORCE_INLINE SIMD_16x32() { m_avx512 = _mm512_setzero_si512(); }

      BOTAN_FN_ISA_AVX512

      explicit SIMD_16x32(const uint32_t B[16]) { m_avx512 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(B)); }

      BOTAN_FN_ISA_AVX512

      explicit SIMD_16x32(uint32_t B0,

                          uint32_t B1,

                          uint32_t B2,

                          uint32_t B3,

                          uint32_t B4,

                          uint32_t B5,

                          uint32_t B6,

                          uint32_t B7,

                          uint32_t B8,

                          uint32_t B9,

                          uint32_t BA,

                          uint32_t BB,

                          uint32_t BC,

                          uint32_t BD,

                          uint32_t BE,

                          uint32_t BF) {

         m_avx512 = _mm512_set_epi32(BF, BE, BD, BC, BB, BA, B9, B8, B7, B6, B5, B4, B3, B2, B1, B0);

      BOTAN_FN_ISA_AVX512

      static SIMD_16x32 splat(uint32_t B) { return SIMD_16x32(_mm512_set1_epi32(B)); }

      BOTAN_FN_ISA_AVX512

      static SIMD_16x32 load_le(const uint8_t* in) {

         return SIMD_16x32(_mm512_loadu_si512(reinterpret_cast<const __m512i*>(in)));

      BOTAN_FN_ISA_AVX512

      static SIMD_16x32 load_be(const uint8_t* in) { return load_le(in).bswap(); }

      BOTAN_FN_ISA_AVX512

      void store_le(uint8_t out[]) const { _mm512_storeu_si512(reinterpret_cast<__m512i*>(out), m_avx512); }

      BOTAN_FN_ISA_AVX512

      void store_be(uint8_t out[]) const { bswap().store_le(out); }

      template <size_t ROT>

      BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const

         requires(ROT > 0 && ROT < 32)

         return SIMD_16x32(_mm512_rol_epi32(m_avx512, ROT));

      template <size_t ROT>

      BOTAN_FN_ISA_AVX512 SIMD_16x32 rotr() const {

         return this->rotl<32 - ROT>();

      SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma0() const {

         const SIMD_16x32 rot1 = this->rotr<2>();

         const SIMD_16x32 rot2 = this->rotr<13>();

         const SIMD_16x32 rot3 = this->rotr<22>();

         return rot1 ^ rot2 ^ rot3;

      SIMD_16x32 BOTAN_FN_ISA_AVX512 sigma1() const {

         const SIMD_16x32 rot1 = this->rotr<6>();

         const SIMD_16x32 rot2 = this->rotr<11>();

         const SIMD_16x32 rot3 = this->rotr<25>();

         return rot1 ^ rot2 ^ rot3;

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 operator+(const SIMD_16x32& other) const {

         SIMD_16x32 retval(*this);

         retval += other;

         return retval;

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 operator-(const SIMD_16x32& other) const {

         SIMD_16x32 retval(*this);

         retval -= other;

         return retval;

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 operator^(const SIMD_16x32& other) const {

         SIMD_16x32 retval(*this);

         retval ^= other;

         return retval;

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 operator|(const SIMD_16x32& other) const {

         SIMD_16x32 retval(*this);

         retval |= other;

         return retval;

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 operator&(const SIMD_16x32& other) const {

         SIMD_16x32 retval(*this);

         retval &= other;

         return retval;

      BOTAN_FN_ISA_AVX512

      void operator+=(const SIMD_16x32& other) { m_avx512 = _mm512_add_epi32(m_avx512, other.m_avx512); }

      BOTAN_FN_ISA_AVX512

      void operator-=(const SIMD_16x32& other) { m_avx512 = _mm512_sub_epi32(m_avx512, other.m_avx512); }

      BOTAN_FN_ISA_AVX512

      void operator^=(const SIMD_16x32& other) { m_avx512 = _mm512_xor_si512(m_avx512, other.m_avx512); }

      BOTAN_FN_ISA_AVX512

      void operator^=(uint32_t other) { *this ^= SIMD_16x32::splat(other); }

      BOTAN_FN_ISA_AVX512

      void operator|=(const SIMD_16x32& other) { m_avx512 = _mm512_or_si512(m_avx512, other.m_avx512); }

      BOTAN_FN_ISA_AVX512

      void operator&=(const SIMD_16x32& other) { m_avx512 = _mm512_and_si512(m_avx512, other.m_avx512); }

      template <int SHIFT>

      BOTAN_FN_ISA_AVX512 SIMD_16x32 shl() const {

         return SIMD_16x32(_mm512_slli_epi32(m_avx512, SHIFT));

      template <int SHIFT>

      BOTAN_FN_ISA_AVX512 SIMD_16x32 shr() const {

         return SIMD_16x32(_mm512_srli_epi32(m_avx512, SHIFT));

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 operator~() const { return SIMD_16x32(_mm512_xor_si512(m_avx512, _mm512_set1_epi32(0xFFFFFFFF))); }

      // (~reg) & other

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 andc(const SIMD_16x32& other) const {

         return SIMD_16x32(_mm512_andnot_si512(m_avx512, other.m_avx512));

      template <uint8_t TBL>

      BOTAN_FN_ISA_AVX512 static SIMD_16x32 ternary_fn(const SIMD_16x32& a, const SIMD_16x32& b, const SIMD_16x32& c) {

         return _mm512_ternarylogic_epi32(a.raw(), b.raw(), c.raw(), TBL);

      BOTAN_FN_ISA_AVX512

      SIMD_16x32 bswap() const {

         const uint8_t BSWAP_MASK[64] = {

            3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,

            21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40,

            47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60,

};

         const __m512i bswap = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(BSWAP_MASK));

         const __m512i output = _mm512_shuffle_epi8(m_avx512, bswap);

         return SIMD_16x32(output);

      BOTAN_FN_ISA_AVX512

      static void transpose(SIMD_16x32& B0, SIMD_16x32& B1, SIMD_16x32& B2, SIMD_16x32& B3) {

         const __m512i T0 = _mm512_unpacklo_epi32(B0.m_avx512, B1.m_avx512);

         const __m512i T1 = _mm512_unpacklo_epi32(B2.m_avx512, B3.m_avx512);

         const __m512i T2 = _mm512_unpackhi_epi32(B0.m_avx512, B1.m_avx512);

         const __m512i T3 = _mm512_unpackhi_epi32(B2.m_avx512, B3.m_avx512);

         B0.m_avx512 = _mm512_unpacklo_epi64(T0, T1);

         B1.m_avx512 = _mm512_unpackhi_epi64(T0, T1);

         B2.m_avx512 = _mm512_unpacklo_epi64(T2, T3);

         B3.m_avx512 = _mm512_unpackhi_epi64(T2, T3);

      BOTAN_FN_ISA_AVX512

      static void transpose(SIMD_16x32& B0,

                            SIMD_16x32& B1,

                            SIMD_16x32& B2,

                            SIMD_16x32& B3,

                            SIMD_16x32& B4,

                            SIMD_16x32& B5,

                            SIMD_16x32& B6,

                            SIMD_16x32& B7,

                            SIMD_16x32& B8,

                            SIMD_16x32& B9,

                            SIMD_16x32& BA,

                            SIMD_16x32& BB,

                            SIMD_16x32& BC,

                            SIMD_16x32& BD,

                            SIMD_16x32& BE,

                            SIMD_16x32& BF) {

         auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());

         auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());

         auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());

         auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());

         auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());

         auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());

         auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());

         auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());

         auto t8 = _mm512_unpacklo_epi32(B8.raw(), B9.raw());

         auto t9 = _mm512_unpackhi_epi32(B8.raw(), B9.raw());

         auto ta = _mm512_unpacklo_epi32(BA.raw(), BB.raw());

         auto tb = _mm512_unpackhi_epi32(BA.raw(), BB.raw());

         auto tc = _mm512_unpacklo_epi32(BC.raw(), BD.raw());

         auto td = _mm512_unpackhi_epi32(BC.raw(), BD.raw());

         auto te = _mm512_unpacklo_epi32(BE.raw(), BF.raw());

         auto tf = _mm512_unpackhi_epi32(BE.raw(), BF.raw());

         auto r0 = _mm512_unpacklo_epi64(t0, t2);

         auto r1 = _mm512_unpackhi_epi64(t0, t2);

         auto r2 = _mm512_unpacklo_epi64(t1, t3);

         auto r3 = _mm512_unpackhi_epi64(t1, t3);

         auto r4 = _mm512_unpacklo_epi64(t4, t6);

         auto r5 = _mm512_unpackhi_epi64(t4, t6);

         auto r6 = _mm512_unpacklo_epi64(t5, t7);

         auto r7 = _mm512_unpackhi_epi64(t5, t7);

         auto r8 = _mm512_unpacklo_epi64(t8, ta);

         auto r9 = _mm512_unpackhi_epi64(t8, ta);

         auto ra = _mm512_unpacklo_epi64(t9, tb);

         auto rb = _mm512_unpackhi_epi64(t9, tb);

         auto rc = _mm512_unpacklo_epi64(tc, te);

         auto rd = _mm512_unpackhi_epi64(tc, te);

         auto re = _mm512_unpacklo_epi64(td, tf);

         auto rf = _mm512_unpackhi_epi64(td, tf);

         t0 = _mm512_shuffle_i32x4(r0, r4, 0x88);

         t1 = _mm512_shuffle_i32x4(r1, r5, 0x88);

         t2 = _mm512_shuffle_i32x4(r2, r6, 0x88);

         t3 = _mm512_shuffle_i32x4(r3, r7, 0x88);

         t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd);

         t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd);

         t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd);

         t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd);

         t8 = _mm512_shuffle_i32x4(r8, rc, 0x88);

         t9 = _mm512_shuffle_i32x4(r9, rd, 0x88);

         ta = _mm512_shuffle_i32x4(ra, re, 0x88);

         tb = _mm512_shuffle_i32x4(rb, rf, 0x88);

         tc = _mm512_shuffle_i32x4(r8, rc, 0xdd);

         td = _mm512_shuffle_i32x4(r9, rd, 0xdd);

         te = _mm512_shuffle_i32x4(ra, re, 0xdd);

         tf = _mm512_shuffle_i32x4(rb, rf, 0xdd);

         B0.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0x88);

         B1.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0x88);

         B2.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0x88);

         B3.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0x88);

         B4.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0x88);

         B5.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0x88);

         B6.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0x88);

         B7.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0x88);

         B8.m_avx512 = _mm512_shuffle_i32x4(t0, t8, 0xdd);

         B9.m_avx512 = _mm512_shuffle_i32x4(t1, t9, 0xdd);

         BA.m_avx512 = _mm512_shuffle_i32x4(t2, ta, 0xdd);

         BB.m_avx512 = _mm512_shuffle_i32x4(t3, tb, 0xdd);

         BC.m_avx512 = _mm512_shuffle_i32x4(t4, tc, 0xdd);

         BD.m_avx512 = _mm512_shuffle_i32x4(t5, td, 0xdd);

         BE.m_avx512 = _mm512_shuffle_i32x4(t6, te, 0xdd);

         BF.m_avx512 = _mm512_shuffle_i32x4(t7, tf, 0xdd);

      BOTAN_FN_ISA_AVX512

      static SIMD_16x32 choose(const SIMD_16x32& mask, const SIMD_16x32& a, const SIMD_16x32& b) {

         return SIMD_16x32::ternary_fn<0xca>(mask, a, b);

      BOTAN_FN_ISA_AVX512

      static SIMD_16x32 majority(const SIMD_16x32& x, const SIMD_16x32& y, const SIMD_16x32& z) {

         return SIMD_16x32::ternary_fn<0xe8>(x, y, z);

      BOTAN_FN_ISA_AVX512 static void zero_registers() {

         // Unfortunately this only zeros zmm0-zmm15 and not zmm16-zmm32

         _mm256_zeroall();

      __m512i BOTAN_FN_ISA_AVX512 raw() const { return m_avx512; }

      BOTAN_FN_ISA_AVX512

      SIMD_16x32(__m512i x) : m_avx512(x) {}

   private:

      __m512i m_avx512;

};

template <size_t R>

inline SIMD_16x32 rotl(SIMD_16x32 input) {

   return input.rotl<R>();

template <size_t R>

inline SIMD_16x32 rotr(SIMD_16x32 input) {

   return input.rotr<R>();

// For Serpent:

template <size_t S>

inline SIMD_16x32 shl(SIMD_16x32 input) {

   return input.shl<S>();

}  // namespace Botan

#endif

Revision control

Copy as Markdown

Other Tools