Revision control
Copy as Markdown
Other Tools
/*
* (C) 2025 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
#include <botan/internal/shacal2.h>
#include <botan/internal/simd_avx2.h>
#include <botan/internal/simd_avx512.h>
namespace Botan {
namespace SHACAL2_AVX512_F {
namespace {
// NOLINTBEGIN(portability-simd-intrinsics)
/*
* 8x16 Transpose
*
* Convert from
*
* A00 B00 C00 ... H00
* A01 B01 C01 ... H01
* ..
* A15 B15 C15 ... H15
*
* with two blocks stored in each register, into
*
* A00 A01 ... A15
* B00 B01 ... B15
* ...
* H00 H01 ... H15
*/
BOTAN_FN_ISA_AVX512
void transpose_in(SIMD_16x32& B0,
SIMD_16x32& B1,
SIMD_16x32& B2,
SIMD_16x32& B3,
SIMD_16x32& B4,
SIMD_16x32& B5,
SIMD_16x32& B6,
SIMD_16x32& B7) {
auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
auto r0 = _mm512_unpacklo_epi64(t0, t2);
auto r1 = _mm512_unpackhi_epi64(t0, t2);
auto r2 = _mm512_unpacklo_epi64(t1, t3);
auto r3 = _mm512_unpackhi_epi64(t1, t3);
auto r4 = _mm512_unpacklo_epi64(t4, t6);
auto r5 = _mm512_unpackhi_epi64(t4, t6);
auto r6 = _mm512_unpacklo_epi64(t5, t7);
auto r7 = _mm512_unpackhi_epi64(t5, t7);
const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));
B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));
B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));
B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));
B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));
B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));
B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));
B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));
B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));
}
BOTAN_FN_ISA_AVX512
void transpose_out(SIMD_16x32& B0,
SIMD_16x32& B1,
SIMD_16x32& B2,
SIMD_16x32& B3,
SIMD_16x32& B4,
SIMD_16x32& B5,
SIMD_16x32& B6,
SIMD_16x32& B7) {
auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
auto r0 = _mm512_unpacklo_epi64(t0, t2);
auto r1 = _mm512_unpackhi_epi64(t0, t2);
auto r2 = _mm512_unpacklo_epi64(t1, t3);
auto r3 = _mm512_unpackhi_epi64(t1, t3);
auto r4 = _mm512_unpacklo_epi64(t4, t6);
auto r5 = _mm512_unpackhi_epi64(t4, t6);
auto r6 = _mm512_unpacklo_epi64(t5, t7);
auto r7 = _mm512_unpackhi_epi64(t5, t7);
const __m512i tbl0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(8));
auto s0 = _mm512_permutex2var_epi32(r0, tbl0, r4);
auto s1 = _mm512_permutex2var_epi32(r1, tbl0, r5);
auto s2 = _mm512_permutex2var_epi32(r2, tbl0, r6);
auto s3 = _mm512_permutex2var_epi32(r3, tbl0, r7);
auto s4 = _mm512_permutex2var_epi32(r0, tbl1, r4);
auto s5 = _mm512_permutex2var_epi32(r1, tbl1, r5);
auto s6 = _mm512_permutex2var_epi32(r2, tbl1, r6);
auto s7 = _mm512_permutex2var_epi32(r3, tbl1, r7);
B0 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b01000100));
B1 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b01000100));
B2 = SIMD_16x32(_mm512_shuffle_i32x4(s0, s1, 0b11101110));
B3 = SIMD_16x32(_mm512_shuffle_i32x4(s2, s3, 0b11101110));
B4 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b01000100));
B5 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b01000100));
B6 = SIMD_16x32(_mm512_shuffle_i32x4(s4, s5, 0b11101110));
B7 = SIMD_16x32(_mm512_shuffle_i32x4(s6, s7, 0b11101110));
}
// NOLINTEND(portability-simd-intrinsics)
template <typename SimdT>
void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Fwd(const SimdT& A,
const SimdT& B,
const SimdT& C,
SimdT& D,
const SimdT& E,
const SimdT& F,
const SimdT& G,
SimdT& H,
uint32_t RK) {
H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
D += H;
H += A.sigma0() + SimdT::majority(A, B, C);
}
template <typename SimdT>
void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Rev(const SimdT& A,
const SimdT& B,
const SimdT& C,
SimdT& D,
const SimdT& E,
const SimdT& F,
const SimdT& G,
SimdT& H,
uint32_t RK) {
H -= A.sigma0() + SimdT::majority(A, B, C);
D -= H;
H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
}
} // namespace
} // namespace SHACAL2_AVX512_F
size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
using namespace SHACAL2_AVX512_F;
size_t consumed = 0;
while(blocks >= 16) {
SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);
SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);
SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);
SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);
SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);
SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);
SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);
SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);
transpose_in(A, B, C, D, E, F, G, H);
for(size_t r = 0; r != 64; r += 8) {
SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
}
transpose_out(A, B, C, D, E, F, G, H);
A.store_be(out + 64 * 0);
B.store_be(out + 64 * 1);
C.store_be(out + 64 * 2);
D.store_be(out + 64 * 3);
E.store_be(out + 64 * 4);
F.store_be(out + 64 * 5);
G.store_be(out + 64 * 6);
H.store_be(out + 64 * 7);
in += 16 * BLOCK_SIZE;
out += 16 * BLOCK_SIZE;
blocks -= 16;
consumed += 16;
}
while(blocks >= 8) {
SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);
SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);
SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);
SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);
SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);
SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);
SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);
SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);
SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
for(size_t r = 0; r != 64; r += 8) {
SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
}
SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
A.store_be(out + 32 * 0);
B.store_be(out + 32 * 1);
C.store_be(out + 32 * 2);
D.store_be(out + 32 * 3);
E.store_be(out + 32 * 4);
F.store_be(out + 32 * 5);
G.store_be(out + 32 * 6);
H.store_be(out + 32 * 7);
in += 8 * BLOCK_SIZE;
out += 8 * BLOCK_SIZE;
blocks -= 8;
consumed += 8;
}
return consumed;
}
size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
using namespace SHACAL2_AVX512_F;
size_t consumed = 0;
while(blocks >= 16) {
SIMD_16x32 A = SIMD_16x32::load_be(in + 64 * 0);
SIMD_16x32 B = SIMD_16x32::load_be(in + 64 * 1);
SIMD_16x32 C = SIMD_16x32::load_be(in + 64 * 2);
SIMD_16x32 D = SIMD_16x32::load_be(in + 64 * 3);
SIMD_16x32 E = SIMD_16x32::load_be(in + 64 * 4);
SIMD_16x32 F = SIMD_16x32::load_be(in + 64 * 5);
SIMD_16x32 G = SIMD_16x32::load_be(in + 64 * 6);
SIMD_16x32 H = SIMD_16x32::load_be(in + 64 * 7);
transpose_in(A, B, C, D, E, F, G, H);
for(size_t r = 0; r != 64; r += 8) {
SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
}
transpose_out(A, B, C, D, E, F, G, H);
A.store_be(out + 64 * 0);
B.store_be(out + 64 * 1);
C.store_be(out + 64 * 2);
D.store_be(out + 64 * 3);
E.store_be(out + 64 * 4);
F.store_be(out + 64 * 5);
G.store_be(out + 64 * 6);
H.store_be(out + 64 * 7);
in += 16 * BLOCK_SIZE;
out += 16 * BLOCK_SIZE;
blocks -= 16;
consumed += 16;
}
while(blocks >= 8) {
SIMD_8x32 A = SIMD_8x32::load_be(in + 32 * 0);
SIMD_8x32 B = SIMD_8x32::load_be(in + 32 * 1);
SIMD_8x32 C = SIMD_8x32::load_be(in + 32 * 2);
SIMD_8x32 D = SIMD_8x32::load_be(in + 32 * 3);
SIMD_8x32 E = SIMD_8x32::load_be(in + 32 * 4);
SIMD_8x32 F = SIMD_8x32::load_be(in + 32 * 5);
SIMD_8x32 G = SIMD_8x32::load_be(in + 32 * 6);
SIMD_8x32 H = SIMD_8x32::load_be(in + 32 * 7);
SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
for(size_t r = 0; r != 64; r += 8) {
SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[63 - r]);
SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[62 - r]);
SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[61 - r]);
SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[60 - r]);
SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[59 - r]);
SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[58 - r]);
SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[57 - r]);
SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[56 - r]);
}
SIMD_8x32::transpose(A, B, C, D, E, F, G, H);
A.store_be(out + 32 * 0);
B.store_be(out + 32 * 1);
C.store_be(out + 32 * 2);
D.store_be(out + 32 * 3);
E.store_be(out + 32 * 4);
F.store_be(out + 32 * 5);
G.store_be(out + 32 * 6);
H.store_be(out + 32 * 7);
in += 8 * BLOCK_SIZE;
out += 8 * BLOCK_SIZE;
blocks -= 8;
consumed += 8;
}
return consumed;
}
} // namespace Botan