shacal2_avx512.cpp

comm-central/third_party/botan/src/lib/block/shacal2/shacal2_avx512/shacal2_avx512.cpp

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

/*

* (C) 2025 Jack Lloyd

* Botan is released under the Simplified BSD License (see license.txt)

*/

#include <botan/internal/shacal2.h>

#include <botan/internal/simd_avx2.h>

#include <botan/internal/simd_avx512.h>

namespace Botan {

namespace SHACAL2_AVX512_F {

namespace {

// NOLINTBEGIN(portability-simd-intrinsics)

/*

* 8x16 Transpose

* Convert from

* A00 B00 C00 ... H00

* A01 B01 C01 ... H01

* ..

* A15 B15 C15 ... H15

* with two blocks stored in each register, into

* A00 A01 ... A15

* B00 B01 ... B15

* ...

* H00 H01 ... H15

*/

BOTAN_FN_ISA_AVX512

void transpose_in(SIMD_16x32& B0,

                  SIMD_16x32& B1,

                  SIMD_16x32& B2,

                  SIMD_16x32& B3,

                  SIMD_16x32& B4,

                  SIMD_16x32& B5,

                  SIMD_16x32& B6,

                  SIMD_16x32& B7) {

   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());

   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());

   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());

   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());

   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());

   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());

   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());

   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());

   auto r0 = _mm512_unpacklo_epi64(t0, t2);

   auto r1 = _mm512_unpackhi_epi64(t0, t2);

   auto r2 = _mm512_unpacklo_epi64(t1, t3);

   auto r3 = _mm512_unpackhi_epi64(t1, t3);

   auto r4 = _mm512_unpacklo_epi64(t4, t6);

   auto r5 = _mm512_unpackhi_epi64(t4, t6);

   auto r6 = _mm512_unpacklo_epi64(t5, t7);

   auto r7 = _mm512_unpackhi_epi64(t5, t7);

   const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);

   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));

   B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));

   B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));

   B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));

   B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));

   B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));

   B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));

   B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));

   B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));

BOTAN_FN_ISA_AVX512

void transpose_out(SIMD_16x32& B0,

                   SIMD_16x32& B1,

                   SIMD_16x32& B2,

                   SIMD_16x32& B3,

                   SIMD_16x32& B4,

                   SIMD_16x32& B5,

                   SIMD_16x32& B6,

                   SIMD_16x32& B7) {

   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());

   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());

   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());

   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());

   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());

   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());

   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());

   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());

   auto r0 = _mm512_unpacklo_epi64(t0, t2);

   auto r1 = _mm512_unpackhi_epi64(t0, t2);

   auto r2 = _mm512_unpacklo_epi64(t1, t3);

   auto r3 = _mm512_unpackhi_epi64(t1, t3);

   auto r4 = _mm512_unpacklo_epi64(t4, t6);

   auto r5 = _mm512_unpackhi_epi64(t4, t6);

   auto r6 = _mm512_unpacklo_epi64(t5, t7);

   auto r7 = _mm512_unpackhi_epi64(t5, t7);

   const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);

   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8));

   auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4);

   auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5);

   auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6);

   auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7);

   auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4);

   auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5);

   auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6);

   auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7);

   B0 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100));

   B1 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100));

   B2 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110));

   B3 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110));

   B4 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100));

   B5 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100));

   B6 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110));

   B7 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110));

// NOLINTEND(portability-simd-intrinsics)

template <typename SimdT>

void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Fwd(const SimdT& A,

                                                        const SimdT& B,

                                                        const SimdT& C,

                                                        SimdT& D,

                                                        const SimdT& E,

                                                        const SimdT& F,

                                                        const SimdT& G,

                                                        SimdT& H,

                                                        uint32_t RK) {

   H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);

   D += H;

   H += A.sigma0() + SimdT::majority(A, B, C);

template <typename SimdT>

void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Rev(const SimdT& A,

                                                        const SimdT& B,

                                                        const SimdT& C,

                                                        SimdT& D,

                                                        const SimdT& E,

                                                        const SimdT& F,

                                                        const SimdT& G,

                                                        SimdT& H,

                                                        uint32_t RK) {

   H -= A.sigma0() + SimdT::majority(A, B, C);

   D -= H;

   H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);

}  // namespace

}  // namespace SHACAL2_AVX512_F

size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {

   using namespace SHACAL2_AVX512_F;

   size_t consumed = 0;

   while(blocks >= 16) {

      SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);

      SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);

      SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);

      SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);

      SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);

      SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);

      SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);

      SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);

      transpose_in(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {

         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);

         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);

         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);

         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);

         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);

         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);

         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);

         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 64 * 0);

      B.store_be(out + 64 * 1);

      C.store_be(out + 64 * 2);

      D.store_be(out + 64 * 3);

      E.store_be(out + 64 * 4);

      F.store_be(out + 64 * 5);

      G.store_be(out + 64 * 6);

      H.store_be(out + 64 * 7);

      in += 16 * BLOCK_SIZE;

      out += 16 * BLOCK_SIZE;

      blocks -= 16;

      consumed += 16;

   while(blocks >= 8) {

      SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);

      SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);

      SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);

      SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);

      SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);

      SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);

      SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);

      SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {

         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);

         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);

         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);

         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);

         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);

         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);

         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);

         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      A.store_be(out + 32 * 0);

      B.store_be(out + 32 * 1);

      C.store_be(out + 32 * 2);

      D.store_be(out + 32 * 3);

      E.store_be(out + 32 * 4);

      F.store_be(out + 32 * 5);

      G.store_be(out + 32 * 6);

      H.store_be(out + 32 * 7);

      in += 8 * BLOCK_SIZE;

      out += 8 * BLOCK_SIZE;

      blocks -= 8;

      consumed += 8;

   return consumed;

size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {

   using namespace SHACAL2_AVX512_F;

   size_t consumed = 0;

   while(blocks >= 16) {

      SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);

      SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);

      SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);

      SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);

      SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);

      SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);

      SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);

      SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);

      transpose_in(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {

         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);

         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);

         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);

         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);

         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);

         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);

         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);

         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 64 * 0);

      B.store_be(out + 64 * 1);

      C.store_be(out + 64 * 2);

      D.store_be(out + 64 * 3);

      E.store_be(out + 64 * 4);

      F.store_be(out + 64 * 5);

      G.store_be(out + 64 * 6);

      H.store_be(out + 64 * 7);

      in += 16 * BLOCK_SIZE;

      out += 16 * BLOCK_SIZE;

      blocks -= 16;

      consumed += 16;

   while(blocks >= 8) {

      SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);

      SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);

      SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);

      SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);

      SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);

      SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);

      SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);

      SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r != 64; r += 8) {

         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);

         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);

         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);

         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);

         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);

         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);

         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);

         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      A.store_be(out + 32 * 0);

      B.store_be(out + 32 * 1);

      C.store_be(out + 32 * 2);

      D.store_be(out + 32 * 3);

      E.store_be(out + 32 * 4);

      F.store_be(out + 32 * 5);

      G.store_be(out + 32 * 6);

      H.store_be(out + 32 * 7);

      in += 8 * BLOCK_SIZE;

      out += 8 * BLOCK_SIZE;

      blocks -= 8;

      consumed += 8;

   return consumed;

}  // namespace Botan